Merge remote-tracking branch 'origin/master' into use-iobject-storage-for-table-engines-1

This commit is contained in:
kssenii 2024-05-23 16:51:46 +02:00
commit 3004f0b63d
309 changed files with 9911 additions and 935 deletions


@ -42,25 +42,25 @@ At a minimum, the following information should be added (but add more as needed)
> Information about CI checks: https://clickhouse.com/docs/en/development/continuous-integration/
<details>
<summary>Modify your CI run</summary>
<summary>CI Settings</summary>
**NOTE:** If you merge the PR with modified CI you **MUST KNOW** what you are doing
**NOTE:** Checked options will be applied if set before CI RunConfig/PrepareRunConfig step
#### Include tests (required builds will be added automatically):
- [ ] <!---ci_include_fast--> Fast test
#### Run these jobs only (required builds will be added automatically):
- [ ] <!---ci_include_integration--> Integration Tests
- [ ] <!---ci_include_stateless--> Stateless tests
- [ ] <!---ci_include_stateful--> Stateful tests
- [ ] <!---ci_include_unit--> Unit tests
- [ ] <!---ci_include_performance--> Performance tests
- [ ] <!---ci_include_aarch64--> All with aarch64
- [ ] <!---ci_include_asan--> All with ASAN
- [ ] <!---ci_include_tsan--> All with TSAN
- [ ] <!---ci_include_analyzer--> All with Analyzer
- [ ] <!---ci_include_azure --> All with Azure
- [ ] <!---ci_include_KEYWORD--> Add your option here
#### Exclude tests:
#### Deny these jobs:
- [ ] <!---ci_exclude_fast--> Fast test
- [ ] <!---ci_exclude_integration--> Integration Tests
- [ ] <!---ci_exclude_stateless--> Stateless tests
@ -72,7 +72,6 @@ At a minimum, the following information should be added (but add more as needed)
- [ ] <!---ci_exclude_ubsan--> All with UBSAN
- [ ] <!---ci_exclude_coverage--> All with Coverage
- [ ] <!---ci_exclude_aarch64--> All with Aarch64
- [ ] <!---ci_exclude_KEYWORD--> Add your option here
#### Extra options:
- [ ] <!---do_not_test--> do not test (only style check)


@ -22,6 +22,9 @@ jobs:
clear-repository: true # to ensure correct digests
fetch-depth: 0 # to get version
filter: tree:0
- name: Cancel PR workflow
run: |
python3 "$GITHUB_WORKSPACE/tests/ci/ci.py" --cancel-previous-run
- name: Python unit tests
run: |
cd "$GITHUB_WORKSPACE/tests/ci"


@ -2,11 +2,11 @@
# NOTE: has nothing common with DBMS_TCP_PROTOCOL_VERSION,
# only DBMS_TCP_PROTOCOL_VERSION should be incremented on protocol changes.
SET(VERSION_REVISION 54486)
SET(VERSION_REVISION 54487)
SET(VERSION_MAJOR 24)
SET(VERSION_MINOR 5)
SET(VERSION_MINOR 6)
SET(VERSION_PATCH 1)
SET(VERSION_GITHASH 6d4b31322d168356c8b10c43b4cef157c82337ff)
SET(VERSION_DESCRIBE v24.5.1.1-testing)
SET(VERSION_STRING 24.5.1.1)
SET(VERSION_GITHASH 70a1d3a63d47f0be077d67b8deb907230fc7cfb0)
SET(VERSION_DESCRIBE v24.6.1.1-testing)
SET(VERSION_STRING 24.6.1.1)
# end of autochange


@ -11,6 +11,7 @@ RUN apt-get update && env DEBIAN_FRONTEND=noninteractive apt-get install --yes \
aspell \
curl \
git \
gh \
file \
libxml2-utils \
moreutils \


@ -197,6 +197,7 @@ SELECT * FROM nestedt FORMAT TSV
- [input_format_tsv_enum_as_number](/docs/en/operations/settings/settings-formats.md/#input_format_tsv_enum_as_number) - treat inserted enum values in TSV formats as enum indices. Default value - `false`.
- [input_format_tsv_use_best_effort_in_schema_inference](/docs/en/operations/settings/settings-formats.md/#input_format_tsv_use_best_effort_in_schema_inference) - use some tweaks and heuristics to infer schema in TSV format. If disabled, all fields will be inferred as Strings. Default value - `true`.
- [output_format_tsv_crlf_end_of_line](/docs/en/operations/settings/settings-formats.md/#output_format_tsv_crlf_end_of_line) - if it is set true, end of line in TSV output format will be `\r\n` instead of `\n`. Default value - `false`.
- [input_format_tsv_crlf_end_of_line](/docs/en/operations/settings/settings-formats.md/#input_format_tsv_crlf_end_of_line) - if it is set true, end of line in TSV input format will be `\r\n` instead of `\n`. Default value - `false`.
- [input_format_tsv_skip_first_lines](/docs/en/operations/settings/settings-formats.md/#input_format_tsv_skip_first_lines) - skip specified number of lines at the beginning of data. Default value - `0`.
- [input_format_tsv_detect_header](/docs/en/operations/settings/settings-formats.md/#input_format_tsv_detect_header) - automatically detect header with names and types in TSV format. Default value - `true`.
- [input_format_tsv_skip_trailing_empty_lines](/docs/en/operations/settings/settings-formats.md/#input_format_tsv_skip_trailing_empty_lines) - skip trailing empty lines at the end of data. Default value - `false`.


@ -561,6 +561,25 @@ Default value: 5000
<max_table_num_to_warn>400</max_table_num_to_warn>
```
## max\_view\_num\_to\_warn {#max-view-num-to-warn}
If the number of attached views exceeds the specified value, the ClickHouse server will add warning messages to the `system.warnings` table.
Default value: 10000
**Example**
``` xml
<max_view_num_to_warn>400</max_view_num_to_warn>
```
## max\_dictionary\_num\_to\_warn {#max-dictionary-num-to-warn}
If the number of attached dictionaries exceeds the specified value, the ClickHouse server will add warning messages to the `system.warnings` table.
Default value: 1000
**Example**
``` xml
<max_dictionary_num_to_warn>400</max_dictionary_num_to_warn>
```
## max\_part\_num\_to\_warn {#max-part-num-to-warn}
If the number of active parts exceeds the specified value, the ClickHouse server will add warning messages to the `system.warnings` table.
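These settings only produce warnings; the accumulated messages can be inspected directly with a query like the following (a minimal sketch):
```sql
SELECT * FROM system.warnings;
```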


@ -831,7 +831,13 @@ Default value: `0`.
### output_format_tsv_crlf_end_of_line {#output_format_tsv_crlf_end_of_line}
Use DOC/Windows-style line separator (CRLF) in TSV instead of Unix style (LF).
Use DOS/Windows-style line separator (CRLF) in TSV instead of Unix style (LF).
Disabled by default.
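For example, a minimal sketch (the output file name `out.tsv` is only illustrative):
```sql
-- run from clickhouse-client; writes CRLF-terminated rows to out.tsv
SET output_format_tsv_crlf_end_of_line = 1;
SELECT number, 'value' FROM numbers(3) INTO OUTFILE 'out.tsv' FORMAT TSV;
```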
### input_format_tsv_crlf_end_of_line {#input_format_tsv_crlf_end_of_line}
Use DOS/Windows-style line separator (CRLF) for TSV input files instead of Unix style (LF).
Disabled by default.
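For example, a minimal sketch (assuming a CRLF-terminated file named `data.tsv` in the user files directory):
```sql
-- data.tsv is a hypothetical file with \r\n line endings
SET input_format_tsv_crlf_end_of_line = 1;
SELECT * FROM file('data.tsv', 'TSV', 'c1 String, c2 UInt64');
```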


@ -421,6 +421,7 @@ Other parameters:
* `skip_access_check` - If true, disk access checks will not be performed on disk start-up. Default value is `false`.
* `read_resource` — Resource name to be used for [scheduling](/docs/en/operations/workload-scheduling.md) of read requests to this disk. Default value is empty string (IO scheduling is not enabled for this disk).
* `write_resource` — Resource name to be used for [scheduling](/docs/en/operations/workload-scheduling.md) of write requests to this disk. Default value is empty string (IO scheduling is not enabled for this disk).
* `metadata_keep_free_space_bytes` - the amount of free metadata disk space to be reserved.
Examples of working configurations can be found in integration tests directory (see e.g. [test_merge_tree_azure_blob_storage](https://github.com/ClickHouse/ClickHouse/blob/master/tests/integration/test_merge_tree_azure_blob_storage/configs/config.d/storage_conf.xml) or [test_azure_blob_storage_zero_copy_replication](https://github.com/ClickHouse/ClickHouse/blob/master/tests/integration/test_azure_blob_storage_zero_copy_replication/configs/config.d/storage_conf.xml)).


@ -0,0 +1,495 @@
---
slug: /en/sql-reference/data-types/dynamic
sidebar_position: 56
sidebar_label: Dynamic
---
# Dynamic
This type allows storing values of any type without knowing all of them in advance.
To declare a column of `Dynamic` type, use the following syntax:
``` sql
<column_name> Dynamic(max_types=N)
```
Where `N` is an optional parameter between `1` and `255` indicating how many different data types can be stored inside a column with type `Dynamic` within a single block of data that is stored separately (for example, within a single data part of a MergeTree table). If this limit is exceeded, all values of new types will be converted to type `String`. The default value of `max_types` is `32`.
:::note
The Dynamic data type is an experimental feature. To use it, set `allow_experimental_dynamic_type = 1`.
:::
## Creating Dynamic
Using `Dynamic` type in table column definition:
```sql
CREATE TABLE test (d Dynamic) ENGINE = Memory;
INSERT INTO test VALUES (NULL), (42), ('Hello, World!'), ([1, 2, 3]);
SELECT d, dynamicType(d) FROM test;
```
```text
┌─d─────────────┬─dynamicType(d)─┐
│ ᴺᵁᴸᴸ │ None │
│ 42 │ Int64 │
│ Hello, World! │ String │
│ [1,2,3] │ Array(Int64) │
└───────────────┴────────────────┘
```
Using CAST from ordinary column:
```sql
SELECT 'Hello, World!'::Dynamic as d, dynamicType(d);
```
```text
┌─d─────────────┬─dynamicType(d)─┐
│ Hello, World! │ String │
└───────────────┴────────────────┘
```
Using CAST from `Variant` column:
```sql
SET allow_experimental_variant_type = 1, use_variant_as_common_type = 1;
SELECT multiIf((number % 3) = 0, number, (number % 3) = 1, range(number + 1), NULL)::Dynamic AS d, dynamicType(d) FROM numbers(3)
```
```text
┌─d─────┬─dynamicType(d)─┐
│ 0 │ UInt64 │
│ [0,1] │ Array(UInt64) │
│ ᴺᵁᴸᴸ │ None │
└───────┴────────────────┘
```
## Reading Dynamic nested types as subcolumns
`Dynamic` type supports reading a single nested type from a `Dynamic` column using the type name as a subcolumn.
So, if you have a column `d Dynamic`, you can read a subcolumn of any valid type `T` using the syntax `d.T`.
This subcolumn has type `Nullable(T)` if `T` can be placed inside `Nullable`, and `T` otherwise. It has the same size
as the original `Dynamic` column and contains `NULL` values (or empty values if `T` cannot be inside `Nullable`)
in all rows in which the original `Dynamic` column does not hold type `T`.
`Dynamic` subcolumns can also be read using the function `dynamicElement(dynamic_column, type_name)`.
Examples:
```sql
CREATE TABLE test (d Dynamic) ENGINE = Memory;
INSERT INTO test VALUES (NULL), (42), ('Hello, World!'), ([1, 2, 3]);
SELECT d, dynamicType(d), d.String, d.Int64, d.`Array(Int64)`, d.Date, d.`Array(String)` FROM test;
```
```text
┌─d─────────────┬─dynamicType(d)─┬─d.String──────┬─d.Int64─┬─d.Array(Int64)─┬─d.Date─┬─d.Array(String)─┐
│ ᴺᵁᴸᴸ │ None │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [] │ ᴺᵁᴸᴸ │ [] │
│ 42 │ Int64 │ ᴺᵁᴸᴸ │ 42 │ [] │ ᴺᵁᴸᴸ │ [] │
│ Hello, World! │ String │ Hello, World! │ ᴺᵁᴸᴸ │ [] │ ᴺᵁᴸᴸ │ [] │
│ [1,2,3] │ Array(Int64) │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [1,2,3] │ ᴺᵁᴸᴸ │ [] │
└───────────────┴────────────────┴───────────────┴─────────┴────────────────┴────────┴─────────────────┘
```
```sql
SELECT toTypeName(d.String), toTypeName(d.Int64), toTypeName(d.`Array(Int64)`), toTypeName(d.Date), toTypeName(d.`Array(String)`) FROM test LIMIT 1;
```
```text
┌─toTypeName(d.String)─┬─toTypeName(d.Int64)─┬─toTypeName(d.Array(Int64))─┬─toTypeName(d.Date)─┬─toTypeName(d.Array(String))─┐
│ Nullable(String) │ Nullable(Int64) │ Array(Int64) │ Nullable(Date) │ Array(String) │
└──────────────────────┴─────────────────────┴────────────────────────────┴────────────────────┴─────────────────────────────┘
```
```sql
SELECT d, dynamicType(d), dynamicElement(d, 'String'), dynamicElement(d, 'Int64'), dynamicElement(d, 'Array(Int64)'), dynamicElement(d, 'Date'), dynamicElement(d, 'Array(String)') FROM test;
```
```text
┌─d─────────────┬─dynamicType(d)─┬─dynamicElement(d, 'String')─┬─dynamicElement(d, 'Int64')─┬─dynamicElement(d, 'Array(Int64)')─┬─dynamicElement(d, 'Date')─┬─dynamicElement(d, 'Array(String)')─┐
│ ᴺᵁᴸᴸ │ None │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [] │ ᴺᵁᴸᴸ │ [] │
│ 42 │ Int64 │ ᴺᵁᴸᴸ │ 42 │ [] │ ᴺᵁᴸᴸ │ [] │
│ Hello, World! │ String │ Hello, World! │ ᴺᵁᴸᴸ │ [] │ ᴺᵁᴸᴸ │ [] │
│ [1,2,3] │ Array(Int64) │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [1,2,3] │ ᴺᵁᴸᴸ │ [] │
└───────────────┴────────────────┴─────────────────────────────┴────────────────────────────┴───────────────────────────────────┴───────────────────────────┴────────────────────────────────────┘
```
To know which variant is stored in each row, the function `dynamicType(dynamic_column)` can be used. It returns a `String` with the value type name for each row (or `'None'` if the row is `NULL`).
Example:
```sql
CREATE TABLE test (d Dynamic) ENGINE = Memory;
INSERT INTO test VALUES (NULL), (42), ('Hello, World!'), ([1, 2, 3]);
SELECT dynamicType(d) from test;
```
```text
┌─dynamicType(d)─┐
│ None │
│ Int64 │
│ String │
│ Array(Int64) │
└────────────────┘
```
## Conversion between Dynamic column and other columns
The following conversions can be performed with a `Dynamic` column.
### Converting an ordinary column to a Dynamic column
```sql
SELECT 'Hello, World!'::Dynamic as d, dynamicType(d);
```
```text
┌─d─────────────┬─dynamicType(d)─┐
│ Hello, World! │ String │
└───────────────┴────────────────┘
```
### Converting a String column to a Dynamic column through parsing
To parse values of `Dynamic` type from a `String` column, you can enable the setting `cast_string_to_dynamic_use_inference`:
```sql
SET cast_string_to_dynamic_use_inference = 1;
SELECT CAST(materialize(map('key1', '42', 'key2', 'true', 'key3', '2020-01-01')), 'Map(String, Dynamic)') as map_of_dynamic, mapApply((k, v) -> (k, dynamicType(v)), map_of_dynamic) as map_of_dynamic_types;
```
```text
┌─map_of_dynamic──────────────────────────────┬─map_of_dynamic_types─────────────────────────┐
│ {'key1':42,'key2':true,'key3':'2020-01-01'} │ {'key1':'Int64','key2':'Bool','key3':'Date'} │
└─────────────────────────────────────────────┴──────────────────────────────────────────────┘
```
### Converting a Dynamic column to an ordinary column
It is possible to convert a `Dynamic` column to an ordinary column. In this case, all nested types will be converted to the destination type:
```sql
CREATE TABLE test (d Dynamic) ENGINE = Memory;
INSERT INTO test VALUES (NULL), (42), ('42.42'), (true), ('e10');
SELECT d::Nullable(Float64) FROM test;
```
```text
┌─CAST(d, 'Nullable(Float64)')─┐
│ ᴺᵁᴸᴸ │
│ 42 │
│ 42.42 │
│ 1 │
│ 0 │
└──────────────────────────────┘
```
### Converting a Variant column to a Dynamic column
```sql
CREATE TABLE test (v Variant(UInt64, String, Array(UInt64))) ENGINE = Memory;
INSERT INTO test VALUES (NULL), (42), ('String'), ([1, 2, 3]);
SELECT v::Dynamic as d, dynamicType(d) from test;
```
```text
┌─d───────┬─dynamicType(d)─┐
│ ᴺᵁᴸᴸ │ None │
│ 42 │ UInt64 │
│ String │ String │
│ [1,2,3] │ Array(UInt64) │
└─────────┴────────────────┘
```
### Converting a Dynamic(max_types=N) column to another Dynamic(max_types=K)
If `K >= N`, the data doesn't change during the conversion:
```sql
CREATE TABLE test (d Dynamic(max_types=3)) ENGINE = Memory;
INSERT INTO test VALUES (NULL), (42), (43), ('42.42'), (true);
SELECT d::Dynamic(max_types=5) as d2, dynamicType(d2) FROM test;
```
```text
┌─d2────┬─dynamicType(d2)─┐
│ ᴺᵁᴸᴸ │ None │
│ 42 │ Int64 │
│ 43 │ Int64 │
│ 42.42 │ String │
│ true │ Bool │
└───────┴────────────────┘
```
If `K < N`, then the values with the rarest types are converted to `String`:
```sql
CREATE TABLE test (d Dynamic(max_types=4)) ENGINE = Memory;
INSERT INTO test VALUES (NULL), (42), (43), ('42.42'), (true), ([1, 2, 3]);
SELECT d, dynamicType(d), d::Dynamic(max_types=2) as d2, dynamicType(d2) FROM test;
```
```text
┌─d───────┬─dynamicType(d)─┬─d2──────┬─dynamicType(d2)─┐
│ ᴺᵁᴸᴸ │ None │ ᴺᵁᴸᴸ │ None │
│ 42 │ Int64 │ 42 │ Int64 │
│ 43 │ Int64 │ 43 │ Int64 │
│ 42.42 │ String │ 42.42 │ String │
│ true │ Bool │ true │ String │
│ [1,2,3] │ Array(Int64) │ [1,2,3] │ String │
└─────────┴────────────────┴─────────┴─────────────────┘
```
If `K=1`, all types are converted to `String`:
```sql
CREATE TABLE test (d Dynamic(max_types=4)) ENGINE = Memory;
INSERT INTO test VALUES (NULL), (42), (43), ('42.42'), (true), ([1, 2, 3]);
SELECT d, dynamicType(d), d::Dynamic(max_types=1) as d2, dynamicType(d2) FROM test;
```
```text
┌─d───────┬─dynamicType(d)─┬─d2──────┬─dynamicType(d2)─┐
│ ᴺᵁᴸᴸ │ None │ ᴺᵁᴸᴸ │ None │
│ 42 │ Int64 │ 42 │ String │
│ 43 │ Int64 │ 43 │ String │
│ 42.42 │ String │ 42.42 │ String │
│ true │ Bool │ true │ String │
│ [1,2,3] │ Array(Int64) │ [1,2,3] │ String │
└─────────┴────────────────┴─────────┴─────────────────┘
```
## Reading Dynamic type from the data
All text formats (TSV, CSV, CustomSeparated, Values, JSONEachRow, etc.) support reading the `Dynamic` type. During data parsing, ClickHouse tries to infer the type of each value and uses it when inserting into the `Dynamic` column.
Example:
```sql
SELECT
d,
dynamicType(d),
dynamicElement(d, 'String') AS str,
dynamicElement(d, 'Int64') AS num,
dynamicElement(d, 'Float64') AS float,
dynamicElement(d, 'Date') AS date,
dynamicElement(d, 'Array(Int64)') AS arr
FROM format(JSONEachRow, 'd Dynamic', $$
{"d" : "Hello, World!"},
{"d" : 42},
{"d" : 42.42},
{"d" : "2020-01-01"},
{"d" : [1, 2, 3]}
$$)
```
```text
┌─d─────────────┬─dynamicType(d)─┬─str───────────┬──num─┬─float─┬───────date─┬─arr─────┐
│ Hello, World! │ String │ Hello, World! │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [] │
│ 42 │ Int64 │ ᴺᵁᴸᴸ │ 42 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [] │
│ 42.42 │ Float64 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 42.42 │ ᴺᵁᴸᴸ │ [] │
│ 2020-01-01 │ Date │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 2020-01-01 │ [] │
│ [1,2,3] │ Array(Int64) │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [1,2,3] │
└───────────────┴────────────────┴───────────────┴──────┴───────┴────────────┴─────────┘
```
## Comparing values of Dynamic type
Values of the `Dynamic` type are compared similarly to values of the `Variant` type:
The result of the operator `<` for values `d1` with underlying type `T1` and `d2` with underlying type `T2` of type `Dynamic` is defined as follows:
- If `T1 = T2 = T`, the result will be `d1.T < d2.T` (underlying values will be compared).
- If `T1 != T2`, the result will be `T1 < T2` (type names will be compared).
Examples:
```sql
CREATE TABLE test (d1 Dynamic, d2 Dynamic) ENGINE=Memory;
INSERT INTO test VALUES (42, 42), (42, 43), (42, 'abc'), (42, [1, 2, 3]), (42, []), (42, NULL);
```
```sql
SELECT d2, dynamicType(d2) as d2_type from test order by d2;
```
```text
┌─d2──────┬─d2_type──────┐
│ [] │ Array(Int64) │
│ [1,2,3] │ Array(Int64) │
│ 42 │ Int64 │
│ 43 │ Int64 │
│ abc │ String │
│ ᴺᵁᴸᴸ │ None │
└─────────┴──────────────┘
```
```sql
SELECT d1, dynamicType(d1) as d1_type, d2, dynamicType(d2) as d2_type, d1 = d2, d1 < d2, d1 > d2 from test;
```
```text
┌─d1─┬─d1_type─┬─d2──────┬─d2_type──────┬─equals(d1, d2)─┬─less(d1, d2)─┬─greater(d1, d2)─┐
│ 42 │ Int64 │ 42 │ Int64 │ 1 │ 0 │ 0 │
│ 42 │ Int64 │ 43 │ Int64 │ 0 │ 1 │ 0 │
│ 42 │ Int64 │ abc │ String │ 0 │ 1 │ 0 │
│ 42 │ Int64 │ [1,2,3] │ Array(Int64) │ 0 │ 0 │ 1 │
│ 42 │ Int64 │ [] │ Array(Int64) │ 0 │ 0 │ 1 │
│ 42 │ Int64 │ ᴺᵁᴸᴸ │ None │ 0 │ 1 │ 0 │
└────┴─────────┴─────────┴──────────────┴────────────────┴──────────────┴─────────────────┘
```
If you need to find a row with a specific `Dynamic` value, you can do one of the following:
- Cast the value to the `Dynamic` type:
```sql
SELECT * FROM test WHERE d2 == [1,2,3]::Array(UInt32)::Dynamic;
```
```text
┌─d1─┬─d2──────┐
│ 42 │ [1,2,3] │
└────┴─────────┘
```
- Compare a `Dynamic` subcolumn of the required type:
```sql
SELECT * FROM test WHERE d2.`Array(Int64)` == [1,2,3] -- or using dynamicElement(d2, 'Array(Int64)')
```
```text
┌─d1─┬─d2──────┐
│ 42 │ [1,2,3] │
└────┴─────────┘
```
Sometimes it can be useful to make an additional check on the dynamic type, because subcolumns with complex types like `Array`/`Map`/`Tuple` cannot be inside `Nullable` and will have default values instead of `NULL` in rows with a different type:
```sql
SELECT d2, d2.`Array(Int64)`, dynamicType(d2) FROM test WHERE d2.`Array(Int64)` == [];
```
```text
┌─d2───┬─d2.Array(Int64)──┬─dynamicType(d2)─┐
│ 42 │ [] │ Int64 │
│ 43 │ [] │ Int64 │
│ abc │ [] │ String │
│ [] │ [] │ Array(Int64) │
│ ᴺᵁᴸᴸ │ [] │ None │
└──────┴──────────────────┴─────────────────┘
```
```sql
SELECT d2, d2.`Array(Int64)`, dynamicType(d2) FROM test WHERE dynamicType(d2) == 'Array(Int64)' AND d2.`Array(Int64)` == [];
```
```text
┌─d2─┬─d2.Array(Int64)──┬─dynamicType(d2)─┐
│ [] │ [] │ Array(Int64) │
└────┴──────────────────┴─────────────────┘
```
**Note:** values of `Dynamic` type with different numeric types are considered different values and are not compared with each other; their type names are compared instead.
Example:
```sql
CREATE TABLE test (d Dynamic) ENGINE=Memory;
INSERT INTO test VALUES (1::UInt32), (1::Int64), (100::UInt32), (100::Int64);
SELECT d, dynamicType(d) FROM test ORDER by d;
```
```text
┌─d───┬─dynamicType(d)─┐
│ 1 │ Int64 │
│ 100 │ Int64 │
│ 1 │ UInt32 │
│ 100 │ UInt32 │
└─────┴────────────────┘
```
## Reaching the limit in number of different data types stored inside Dynamic
`Dynamic` data type can store only a limited number of different data types inside. By default, this limit is 32, but you can change it in the type declaration using the syntax `Dynamic(max_types=N)`, where N is between 1 and 255 (due to implementation details, it's impossible to have more than 255 different data types inside `Dynamic`).
When the limit is reached, all new data types inserted into a `Dynamic` column will be cast to `String` and stored as `String` values.
Let's see what happens when the limit is reached in different scenarios.
### Reaching the limit during data parsing
During parsing of `Dynamic` values from the data, when the limit is reached for the current block of data, all new values will be inserted as `String` values:
```sql
SELECT d, dynamicType(d) FROM format(JSONEachRow, 'd Dynamic(max_types=3)', '
{"d" : 42}
{"d" : [1, 2, 3]}
{"d" : "Hello, World!"}
{"d" : "2020-01-01"}
{"d" : ["str1", "str2", "str3"]}
{"d" : {"a" : 1, "b" : [1, 2, 3]}}
')
```
```text
┌─d──────────────────────────┬─dynamicType(d)─┐
│ 42 │ Int64 │
│ [1,2,3] │ Array(Int64) │
│ Hello, World! │ String │
│ 2020-01-01 │ String │
│ ["str1", "str2", "str3"] │ String │
│ {"a" : 1, "b" : [1, 2, 3]} │ String │
└────────────────────────────┴────────────────┘
```
As we can see, after inserting 3 different data types `Int64`, `Array(Int64)` and `String`, all new types were converted to `String`.
### During merges of data parts in MergeTree table engines
During a merge of several data parts in a MergeTree table, the `Dynamic` column in the resulting data part can reach the limit of different data types and won't be able to store all the types from the source parts.
In this case, ClickHouse chooses which types will remain after the merge and which types will be cast to `String`. In most cases, ClickHouse tries to keep the most frequent types and cast the rarest types to `String`, but this depends on the implementation.
Let's see an example of such a merge. First, let's create a table with a `Dynamic` column, set the limit of different data types to `3`, and insert values of `5` different types:
```sql
CREATE TABLE test (id UInt64, d Dynamic(max_types=3)) engine=MergeTree ORDER BY id;
SYSTEM STOP MERGES test;
INSERT INTO test SELECT number, number FROM numbers(5);
INSERT INTO test SELECT number, range(number) FROM numbers(4);
INSERT INTO test SELECT number, toDate(number) FROM numbers(3);
INSERT INTO test SELECT number, map(number, number) FROM numbers(2);
INSERT INTO test SELECT number, 'str_' || toString(number) FROM numbers(1);
```
Each insert will create a separate data part with a `Dynamic` column containing a single type:
```sql
SELECT count(), dynamicType(d), _part FROM test GROUP BY _part, dynamicType(d) ORDER BY _part;
```
```text
┌─count()─┬─dynamicType(d)──────┬─_part─────┐
│ 5 │ UInt64 │ all_1_1_0 │
│ 4 │ Array(UInt64) │ all_2_2_0 │
│ 3 │ Date │ all_3_3_0 │
│ 2 │ Map(UInt64, UInt64) │ all_4_4_0 │
│ 1 │ String │ all_5_5_0 │
└─────────┴─────────────────────┴───────────┘
```
Now, let's merge all parts into one and see what will happen:
```sql
SYSTEM START MERGES test;
OPTIMIZE TABLE test FINAL;
SELECT count(), dynamicType(d), _part FROM test GROUP BY _part, dynamicType(d) ORDER BY _part;
```
```text
┌─count()─┬─dynamicType(d)─┬─_part─────┐
│ 5 │ UInt64 │ all_1_5_2 │
│ 6 │ String │ all_1_5_2 │
│ 4 │ Array(UInt64) │ all_1_5_2 │
└─────────┴────────────────┴───────────┘
```
As we can see, ClickHouse kept the most frequent types `UInt64` and `Array(UInt64)` and cast all other types to `String`.


@ -947,3 +947,49 @@ Result:
│ 11 │
└──────────────────────────────────┘
```
## proportionsZTest
Returns test statistics for the two-proportion Z-test, a statistical test for comparing the proportions from two populations `x` and `y`.
**Syntax**
```sql
proportionsZTest(successes_x, successes_y, trials_x, trials_y, conf_level, pool_type)
```
**Arguments**
- `successes_x`: Number of successes in population `x`. [UInt64](../data-types/int-uint.md).
- `successes_y`: Number of successes in population `y`. [UInt64](../data-types/int-uint.md).
- `trials_x`: Number of trials in population `x`. [UInt64](../data-types/int-uint.md).
- `trials_y`: Number of trials in population `y`. [UInt64](../data-types/int-uint.md).
- `conf_level`: Confidence level for the test. [Float64](../data-types/float.md).
- `pool_type`: Selection of pooling (way in which the standard error is estimated). Can be either `unpooled` or `pooled`. [String](../data-types/string.md).
:::note
For argument `pool_type`: In the pooled version, the two proportions are averaged, and only one proportion is used to estimate the standard error. In the unpooled version, the two proportions are used separately.
:::
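For reference, a sketch of the textbook formulas behind the test (the implementation may differ in details). With $\hat{p}_x = \text{successes}_x / \text{trials}_x$ and $\hat{p}_y = \text{successes}_y / \text{trials}_y$, the statistic is $z = (\hat{p}_x - \hat{p}_y) / SE$, where
$$
SE_\text{pooled} = \sqrt{\hat{p}\,(1-\hat{p})\left(\frac{1}{\text{trials}_x} + \frac{1}{\text{trials}_y}\right)}, \quad \hat{p} = \frac{\text{successes}_x + \text{successes}_y}{\text{trials}_x + \text{trials}_y},
\qquad
SE_\text{unpooled} = \sqrt{\frac{\hat{p}_x(1-\hat{p}_x)}{\text{trials}_x} + \frac{\hat{p}_y(1-\hat{p}_y)}{\text{trials}_y}}
$$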
**Returned value**
- `z_stat`: Z statistic. [Float64](../data-types/float.md).
- `p_val`: P value. [Float64](../data-types/float.md).
- `ci_low`: The lower bound of the confidence interval. [Float64](../data-types/float.md).
- `ci_high`: The upper bound of the confidence interval. [Float64](../data-types/float.md).
**Example**
Query:
```sql
SELECT proportionsZTest(10, 11, 100, 101, 0.95, 'unpooled');
```
Result:
```response
┌─proportionsZTest(10, 11, 100, 101, 0.95, 'unpooled')───────────────────────────────┐
│ (-0.20656724435948853,0.8363478437079654,-0.09345975390115283,0.07563797172293502) │
└────────────────────────────────────────────────────────────────────────────────────┘
```


@ -119,6 +119,7 @@ Hello\nworld
Hello\
world
```
`\n\r` (CRLF) is supported with the `input_format_tsv_crlf_end_of_line` setting.
The second variant is supported because MySQL uses it when writing tab-separated dumps.


@ -1178,7 +1178,7 @@ void Client::processConfig()
pager = config().getString("pager", "");
setDefaultFormatsFromConfiguration();
setDefaultFormatsAndCompressionFromConfiguration();
global_context->setClientName(std::string(DEFAULT_CLIENT_NAME));
global_context->setQueryKindInitial();


@ -182,6 +182,11 @@ std::string Keeper::getDefaultConfigFileName() const
return "keeper_config.xml";
}
bool Keeper::allowTextLog() const
{
return false;
}
void Keeper::handleCustomArguments(const std::string & arg, [[maybe_unused]] const std::string & value) // NOLINT
{
if (arg == "force-recovery")


@ -65,6 +65,8 @@ protected:
std::string getDefaultConfigFileName() const override;
bool allowTextLog() const override;
private:
Poco::Net::SocketAddress socketBindListen(Poco::Net::ServerSocket & socket, const std::string & host, UInt16 port, [[maybe_unused]] bool secure = false) const;


@ -607,7 +607,7 @@ void LocalServer::processConfig()
if (config().has("macros"))
global_context->setMacros(std::make_unique<Macros>(config(), "macros", log));
setDefaultFormatsFromConfiguration();
setDefaultFormatsAndCompressionFromConfiguration();
/// Sets external authenticators config (LDAP, Kerberos).
global_context->setExternalAuthenticatorsConfig(config());


@ -1476,6 +1476,8 @@ try
global_context->setMaxTableSizeToDrop(new_server_settings.max_table_size_to_drop);
global_context->setMaxPartitionSizeToDrop(new_server_settings.max_partition_size_to_drop);
global_context->setMaxTableNumToWarn(new_server_settings.max_table_num_to_warn);
global_context->setMaxViewNumToWarn(new_server_settings.max_view_num_to_warn);
global_context->setMaxDictionaryNumToWarn(new_server_settings.max_dictionary_num_to_warn);
global_context->setMaxDatabaseNumToWarn(new_server_settings.max_database_num_to_warn);
global_context->setMaxPartNumToWarn(new_server_settings.max_part_num_to_warn);


@ -4617,6 +4617,36 @@ QueryAnalyzer::QueryTreeNodesWithNames QueryAnalyzer::resolveUnqualifiedMatcher(
std::unordered_set<std::string> table_expression_column_names_to_skip;
QueryTreeNodesWithNames result;
if (matcher_node_typed.getMatcherType() == MatcherNodeType::COLUMNS_LIST)
{
auto identifiers = matcher_node_typed.getColumnsIdentifiers();
result.reserve(identifiers.size());
for (const auto & identifier : identifiers)
{
auto resolve_result = tryResolveIdentifier(IdentifierLookup{identifier, IdentifierLookupContext::EXPRESSION}, scope);
if (!resolve_result.isResolved())
throw Exception(ErrorCodes::UNKNOWN_IDENTIFIER,
"Unknown identifier '{}' inside COLUMNS matcher. In scope {}",
identifier.getFullName(), scope.dump());
// TODO: Introduce IdentifierLookupContext::COLUMN and get rid of this check
auto * resolved_column = resolve_result.resolved_identifier->as<ColumnNode>();
if (!resolved_column)
throw Exception(ErrorCodes::UNKNOWN_IDENTIFIER,
"Identifier '{}' inside COLUMNS matcher must resolve into a column, but got {}. In scope {}",
identifier.getFullName(),
resolve_result.resolved_identifier->getNodeTypeName(),
scope.scope_node->formatASTForErrorMessage());
result.emplace_back(resolve_result.resolved_identifier, resolved_column->getColumnName());
}
return result;
}
result.resize(matcher_node_typed.getColumnsIdentifiers().size());
for (auto & table_expression : table_expressions_stack)
{
bool table_expression_in_resolve_process = nearest_query_scope->table_expressions_in_resolve_process.contains(table_expression.get());
@ -4784,8 +4814,6 @@ QueryAnalyzer::QueryTreeNodesWithNames QueryAnalyzer::resolveUnqualifiedMatcher(
table_expressions_column_nodes_with_names_stack.push_back(std::move(matched_column_nodes_with_names));
}
QueryTreeNodesWithNames result;
for (auto & table_expression_column_nodes_with_names : table_expressions_column_nodes_with_names_stack)
{
for (auto && table_expression_column_node_with_name : table_expression_column_nodes_with_names)


@ -21,6 +21,7 @@
#include <Common/StringUtils.h>
#include <Common/filesystemHelpers.h>
#include <Common/NetException.h>
#include <Common/tryGetFileNameByFileDescriptor.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnsNumber.h>
#include <Formats/FormatFactory.h>
@ -643,6 +644,9 @@ try
bool extras_into_stdout = need_render_progress || logs_into_stdout;
bool select_only_into_file = select_into_file && !select_into_file_and_stdout;
if (!out_file_buf && default_output_compression_method != CompressionMethod::None)
out_file_buf = wrapWriteBufferWithCompressionMethod(out_buf, default_output_compression_method, 3, 0);
/// It is not clear how to write progress and logs
/// intermixed with data with parallel formatting.
/// It may increase code complexity significantly.
@ -735,7 +739,7 @@ bool ClientBase::isRegularFile(int fd)
return fstat(fd, &file_stat) == 0 && S_ISREG(file_stat.st_mode);
}
void ClientBase::setDefaultFormatsFromConfiguration()
void ClientBase::setDefaultFormatsAndCompressionFromConfiguration()
{
if (config().has("output-format"))
{
@ -759,6 +763,10 @@ void ClientBase::setDefaultFormatsFromConfiguration()
default_output_format = *format_from_file_name;
else
default_output_format = "TSV";
std::optional<String> file_name = tryGetFileNameFromFileDescriptor(STDOUT_FILENO);
if (file_name)
default_output_compression_method = chooseCompressionMethod(*file_name, "");
}
else if (is_interactive)
{


@ -190,7 +190,7 @@ protected:
/// Adjust some settings after command line options and config had been processed.
void adjustSettings();
void setDefaultFormatsFromConfiguration();
void setDefaultFormatsAndCompressionFromConfiguration();
void initTTYBuffer(ProgressOption progress);
@ -224,6 +224,7 @@ protected:
String pager;
String default_output_format; /// Query results output format.
CompressionMethod default_output_compression_method = CompressionMethod::None;
String default_input_format; /// Tables' format for clickhouse-local.
bool select_into_file = false; /// If writing result INTO OUTFILE. It affects progress rendering.


@ -1289,4 +1289,14 @@ size_t ColumnArray::getNumberOfDimensions() const
return 1 + nested_array->getNumberOfDimensions(); /// Every modern C++ compiler optimizes tail recursion.
}
void ColumnArray::takeDynamicStructureFromSourceColumns(const Columns & source_columns)
{
Columns nested_source_columns;
nested_source_columns.reserve(source_columns.size());
for (const auto & source_column : source_columns)
nested_source_columns.push_back(assert_cast<const ColumnArray &>(*source_column).getDataPtr());
data->takeDynamicStructureFromSourceColumns(nested_source_columns);
}
}


@ -175,6 +175,9 @@ public:
size_t getNumberOfDimensions() const;
bool hasDynamicStructure() const override { return getData().hasDynamicStructure(); }
void takeDynamicStructureFromSourceColumns(const Columns & source_columns) override;
private:
WrappedPtr data;
WrappedPtr offsets;


@ -122,6 +122,9 @@ public:
UInt64 getNumberOfDefaultRows() const override { throwMustBeDecompressed(); }
void getIndicesOfNonDefaultRows(Offsets &, size_t, size_t) const override { throwMustBeDecompressed(); }
bool hasDynamicStructure() const override { throwMustBeDecompressed(); }
void takeDynamicStructureFromSourceColumns(const Columns &) override { throwMustBeDecompressed(); }
protected:
size_t rows;
size_t bytes;


@ -306,6 +306,8 @@ public:
T getValue() const { return static_cast<T>(getField().safeGet<T>()); }
bool isCollationSupported() const override { return data->isCollationSupported(); }
bool hasDynamicStructure() const override { return data->hasDynamicStructure(); }
};
ColumnConst::Ptr createColumnConst(const ColumnPtr & column, Field value);


@ -0,0 +1,758 @@
#include <Columns/ColumnDynamic.h>
#include <Columns/ColumnCompressed.h>
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/DataTypeVariant.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/FieldToDataType.h>
#include <Common/Arena.h>
#include <Common/SipHash.h>
#include <Processors/Transforms/ColumnGathererTransform.h>
#include <Interpreters/castColumn.h>
#include <Common/logger_useful.h>
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
extern const int PARAMETER_OUT_OF_BOUND;
}
ColumnDynamic::ColumnDynamic(size_t max_dynamic_types_) : max_dynamic_types(max_dynamic_types_)
{
/// Create empty Variant.
variant_info.variant_type = std::make_shared<DataTypeVariant>(DataTypes{});
variant_info.variant_name = variant_info.variant_type->getName();
variant_column = variant_info.variant_type->createColumn();
}
ColumnDynamic::ColumnDynamic(
MutableColumnPtr variant_column_, const VariantInfo & variant_info_, size_t max_dynamic_types_, const Statistics & statistics_)
: variant_column(std::move(variant_column_))
, variant_info(variant_info_)
, max_dynamic_types(max_dynamic_types_)
, statistics(statistics_)
{
}
ColumnDynamic::MutablePtr ColumnDynamic::create(MutableColumnPtr variant_column, const DataTypePtr & variant_type, size_t max_dynamic_types_, const Statistics & statistics_)
{
VariantInfo variant_info;
variant_info.variant_type = variant_type;
variant_info.variant_name = variant_type->getName();
const auto & variants = assert_cast<const DataTypeVariant &>(*variant_type).getVariants();
variant_info.variant_names.reserve(variants.size());
variant_info.variant_name_to_discriminator.reserve(variants.size());
for (ColumnVariant::Discriminator discr = 0; discr != variants.size(); ++discr)
{
const auto & variant_name = variant_info.variant_names.emplace_back(variants[discr]->getName());
variant_info.variant_name_to_discriminator[variant_name] = discr;
}
return create(std::move(variant_column), variant_info, max_dynamic_types_, statistics_);
}
bool ColumnDynamic::addNewVariant(const DB::DataTypePtr & new_variant)
{
/// Check if we already have such variant.
if (variant_info.variant_name_to_discriminator.contains(new_variant->getName()))
return true;
/// Check if we reached maximum number of variants.
if (variant_info.variant_names.size() >= max_dynamic_types)
{
/// ColumnDynamic can have max_dynamic_types number of variants only when it has String as a variant.
/// Otherwise we won't be able to cast new variants to Strings.
if (!variant_info.variant_name_to_discriminator.contains("String"))
throw Exception(ErrorCodes::LOGICAL_ERROR, "Maximum number of variants reached, but no String variant exists");
return false;
}
/// If we have (max_dynamic_types - 1) number of variants and don't have String variant, we can add only String variant.
if (variant_info.variant_names.size() == max_dynamic_types - 1 && new_variant->getName() != "String" && !variant_info.variant_name_to_discriminator.contains("String"))
return false;
const DataTypes & current_variants = assert_cast<const DataTypeVariant &>(*variant_info.variant_type).getVariants();
DataTypes all_variants = current_variants;
all_variants.push_back(new_variant);
auto new_variant_type = std::make_shared<DataTypeVariant>(all_variants);
updateVariantInfoAndExpandVariantColumn(new_variant_type);
return true;
}
void ColumnDynamic::addStringVariant()
{
if (!addNewVariant(std::make_shared<DataTypeString>()))
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot add String variant to Dynamic column, it's a bug");
}
void ColumnDynamic::updateVariantInfoAndExpandVariantColumn(const DB::DataTypePtr & new_variant_type)
{
const DataTypes & current_variants = assert_cast<const DataTypeVariant *>(variant_info.variant_type.get())->getVariants();
const DataTypes & new_variants = assert_cast<const DataTypeVariant *>(new_variant_type.get())->getVariants();
Names new_variant_names;
new_variant_names.reserve(new_variants.size());
std::unordered_map<String, ColumnVariant::Discriminator> new_variant_name_to_discriminator;
new_variant_name_to_discriminator.reserve(new_variants.size());
std::vector<std::pair<MutableColumnPtr, ColumnVariant::Discriminator>> new_variant_columns_and_discriminators_to_add;
new_variant_columns_and_discriminators_to_add.reserve(new_variants.size() - current_variants.size());
std::vector<ColumnVariant::Discriminator> current_to_new_discriminators;
current_to_new_discriminators.resize(current_variants.size());
for (ColumnVariant::Discriminator discr = 0; discr != new_variants.size(); ++discr)
{
const auto & name = new_variant_names.emplace_back(new_variants[discr]->getName());
new_variant_name_to_discriminator[name] = discr;
auto current_it = variant_info.variant_name_to_discriminator.find(name);
if (current_it == variant_info.variant_name_to_discriminator.end())
new_variant_columns_and_discriminators_to_add.emplace_back(new_variants[discr]->createColumn(), discr);
else
current_to_new_discriminators[current_it->second] = discr;
}
variant_info.variant_type = new_variant_type;
variant_info.variant_name = new_variant_type->getName();
variant_info.variant_names = new_variant_names;
variant_info.variant_name_to_discriminator = new_variant_name_to_discriminator;
assert_cast<ColumnVariant &>(*variant_column).extend(current_to_new_discriminators, std::move(new_variant_columns_and_discriminators_to_add));
/// Clear mappings cache because now with new Variant we will have new mappings.
variant_mappings_cache.clear();
}
std::vector<ColumnVariant::Discriminator> * ColumnDynamic::combineVariants(const DB::ColumnDynamic::VariantInfo & other_variant_info)
{
/// Check if we already have global discriminators mapping for other Variant in cache.
/// It's used to not calculate the same mapping each call of insertFrom with the same columns.
auto cache_it = variant_mappings_cache.find(other_variant_info.variant_name);
if (cache_it != variant_mappings_cache.end())
return &cache_it->second;
/// Check if we already tried to combine these variants but failed due to max_dynamic_types limit.
if (variants_with_failed_combination.contains(other_variant_info.variant_name))
return nullptr;
const DataTypes & other_variants = assert_cast<const DataTypeVariant &>(*other_variant_info.variant_type).getVariants();
size_t num_new_variants = 0;
for (size_t i = 0; i != other_variants.size(); ++i)
{
if (!variant_info.variant_name_to_discriminator.contains(other_variant_info.variant_names[i]))
++num_new_variants;
}
/// If we have new variants we need to update current variant info and extend Variant column
if (num_new_variants)
{
const DataTypes & current_variants = assert_cast<const DataTypeVariant &>(*variant_info.variant_type).getVariants();
/// We cannot combine Variants if total number of variants exceeds max_dynamic_types.
if (current_variants.size() + num_new_variants > max_dynamic_types)
{
/// Remember that we cannot combine our variant with this one, so we will not try to do it again.
variants_with_failed_combination.insert(other_variant_info.variant_name);
return nullptr;
}
/// We cannot combine Variants if total number of variants reaches max_dynamic_types and we don't have String variant.
if (current_variants.size() + num_new_variants == max_dynamic_types && !variant_info.variant_name_to_discriminator.contains("String") && !other_variant_info.variant_name_to_discriminator.contains("String"))
{
variants_with_failed_combination.insert(other_variant_info.variant_name);
return nullptr;
}
DataTypes all_variants = current_variants;
all_variants.insert(all_variants.end(), other_variants.begin(), other_variants.end());
auto new_variant_type = std::make_shared<DataTypeVariant>(all_variants);
updateVariantInfoAndExpandVariantColumn(new_variant_type);
}
/// Create a global discriminators mapping for other variant.
std::vector<ColumnVariant::Discriminator> other_to_new_discriminators;
other_to_new_discriminators.reserve(other_variants.size());
for (size_t i = 0; i != other_variants.size(); ++i)
other_to_new_discriminators.push_back(variant_info.variant_name_to_discriminator[other_variant_info.variant_names[i]]);
/// Save mapping to cache to not calculate it again for the same Variants.
auto [it, _] = variant_mappings_cache.emplace(other_variant_info.variant_name, std::move(other_to_new_discriminators));
return &it->second;
}
void ColumnDynamic::insert(const DB::Field & x)
{
/// Check if we can insert field without Variant extension.
if (variant_column->tryInsert(x))
return;
/// If we cannot insert field into current variant column, extend it with new variant for this field from its type.
if (addNewVariant(applyVisitor(FieldToDataType(), x)))
{
/// Now we should be able to insert this field into extended variant column.
variant_column->insert(x);
}
else
{
/// We reached maximum number of variants and couldn't add new variant.
/// This case should be really rare in real use cases.
/// We should always be able to add String variant and cast inserted value to String.
addStringVariant();
variant_column->insert(toString(x));
}
}
bool ColumnDynamic::tryInsert(const DB::Field & x)
{
/// We can insert any value into Dynamic column.
insert(x);
return true;
}
void ColumnDynamic::insertFrom(const DB::IColumn & src_, size_t n)
{
const auto & dynamic_src = assert_cast<const ColumnDynamic &>(src_);
/// Check if we have the same variants in both columns.
if (variant_info.variant_name == dynamic_src.variant_info.variant_name)
{
variant_column->insertFrom(*dynamic_src.variant_column, n);
return;
}
auto & variant_col = assert_cast<ColumnVariant &>(*variant_column);
/// If variants are different, we need to extend our variant with new variants.
if (auto * global_discriminators_mapping = combineVariants(dynamic_src.variant_info))
{
variant_col.insertFrom(*dynamic_src.variant_column, n, *global_discriminators_mapping);
return;
}
/// We cannot combine 2 Variant types as total number of variants exceeds the limit.
/// We need to insert single value, try to add only corresponding variant.
const auto & src_variant_col = assert_cast<const ColumnVariant &>(*dynamic_src.variant_column);
auto src_global_discr = src_variant_col.globalDiscriminatorAt(n);
/// NULL doesn't require Variant extension.
if (src_global_discr == ColumnVariant::NULL_DISCRIMINATOR)
{
insertDefault();
return;
}
auto variant_type = assert_cast<const DataTypeVariant &>(*dynamic_src.variant_info.variant_type).getVariants()[src_global_discr];
if (addNewVariant(variant_type))
{
auto discr = variant_info.variant_name_to_discriminator[dynamic_src.variant_info.variant_names[src_global_discr]];
variant_col.insertIntoVariantFrom(discr, src_variant_col.getVariantByGlobalDiscriminator(src_global_discr), src_variant_col.offsetAt(n));
return;
}
/// We reached maximum number of variants and couldn't add new variant.
/// We should always be able to add String variant and cast inserted value to String.
addStringVariant();
auto tmp_variant_column = src_variant_col.getVariantByGlobalDiscriminator(src_global_discr).cloneEmpty();
tmp_variant_column->insertFrom(src_variant_col.getVariantByGlobalDiscriminator(src_global_discr), src_variant_col.offsetAt(n));
auto tmp_string_column = castColumn(ColumnWithTypeAndName(tmp_variant_column->getPtr(), variant_type, ""), std::make_shared<DataTypeString>());
auto string_variant_discr = variant_info.variant_name_to_discriminator["String"];
variant_col.insertIntoVariantFrom(string_variant_discr, *tmp_string_column, 0);
}
void ColumnDynamic::insertRangeFrom(const DB::IColumn & src_, size_t start, size_t length)
{
if (start + length > src_.size())
throw Exception(ErrorCodes::PARAMETER_OUT_OF_BOUND, "Parameter out of bound in ColumnDynamic::insertRangeFrom method. "
"[start({}) + length({}) > src.size()({})]", start, length, src_.size());
const auto & dynamic_src = assert_cast<const ColumnDynamic &>(src_);
/// Check if we have the same variants in both columns.
if (variant_info.variant_names == dynamic_src.variant_info.variant_names)
{
variant_column->insertRangeFrom(*dynamic_src.variant_column, start, length);
return;
}
auto & variant_col = assert_cast<ColumnVariant &>(*variant_column);
/// If variants are different, we need to extend our variant with new variants.
if (auto * global_discriminators_mapping = combineVariants(dynamic_src.variant_info))
{
variant_col.insertRangeFrom(*dynamic_src.variant_column, start, length, *global_discriminators_mapping);
return;
}
/// We cannot combine 2 Variant types as total number of variants exceeds the limit.
/// In this case we will add most frequent variants from this range and insert them as usual,
/// all other variants will be converted to String.
/// TODO: instead of keeping all current variants and just adding new most frequent variants
/// from source columns we can also try to replace rarest existing variants with frequent
/// variants from source column (so we will avoid casting new frequent variants to String
/// and keeping rare existing ones). It will require rewriting of existing data in Variant
/// column but will improve usability of Dynamic column for example during squashing blocks
/// during insert.
const auto & src_variant_column = dynamic_src.getVariantColumn();
/// Calculate ranges for each variant in current range.
std::vector<std::pair<size_t, size_t>> variants_ranges(dynamic_src.variant_info.variant_names.size(), {0, 0});
/// If we insert the whole column, no need to iterate through the range, we can just take variant sizes.
if (start == 0 && length == dynamic_src.size())
{
for (size_t i = 0; i != dynamic_src.variant_info.variant_names.size(); ++i)
variants_ranges[i] = {0, src_variant_column.getVariantByGlobalDiscriminator(i).size()};
}
/// Otherwise we need to iterate through discriminators and calculate the range for each variant.
else
{
const auto & local_discriminators = src_variant_column.getLocalDiscriminators();
const auto & offsets = src_variant_column.getOffsets();
size_t end = start + length;
for (size_t i = start; i != end; ++i)
{
auto discr = src_variant_column.globalDiscriminatorByLocal(local_discriminators[i]);
if (discr != ColumnVariant::NULL_DISCRIMINATOR)
{
if (!variants_ranges[discr].second)
variants_ranges[discr].first = offsets[i];
++variants_ranges[discr].second;
}
}
}
const auto & src_variants = assert_cast<const DataTypeVariant &>(*dynamic_src.variant_info.variant_type).getVariants();
/// List of variants that will be converted to String.
std::vector<ColumnVariant::Discriminator> variants_to_convert_to_string;
/// Mapping from global discriminators of src_variant to the new variant we will create.
std::vector<ColumnVariant::Discriminator> other_to_new_discriminators;
other_to_new_discriminators.reserve(dynamic_src.variant_info.variant_names.size());
/// Check if we cannot add any more new variants. In this case we will convert all new variants to String.
if (variant_info.variant_names.size() == max_dynamic_types || (variant_info.variant_names.size() == max_dynamic_types - 1 && !variant_info.variant_name_to_discriminator.contains("String")))
{
addStringVariant();
for (size_t i = 0; i != dynamic_src.variant_info.variant_names.size(); ++i)
{
auto it = variant_info.variant_name_to_discriminator.find(dynamic_src.variant_info.variant_names[i]);
if (it == variant_info.variant_name_to_discriminator.end())
{
variants_to_convert_to_string.push_back(i);
other_to_new_discriminators.push_back(variant_info.variant_name_to_discriminator["String"]);
}
else
{
other_to_new_discriminators.push_back(it->second);
}
}
}
/// We still can add some new variants, but not all of them. Let's choose the most frequent variants in specified range.
else
{
std::vector<std::pair<size_t, ColumnVariant::Discriminator>> new_variants_with_sizes;
new_variants_with_sizes.reserve(dynamic_src.variant_info.variant_names.size());
for (size_t i = 0; i != dynamic_src.variant_info.variant_names.size(); ++i)
{
const auto & variant_name = dynamic_src.variant_info.variant_names[i];
if (variant_name != "String" && !variant_info.variant_name_to_discriminator.contains(variant_name))
new_variants_with_sizes.emplace_back(variants_ranges[i].second, i);
}
std::sort(new_variants_with_sizes.begin(), new_variants_with_sizes.end(), std::greater());
DataTypes new_variants = assert_cast<const DataTypeVariant &>(*variant_info.variant_type).getVariants();
if (!variant_info.variant_name_to_discriminator.contains("String"))
new_variants.push_back(std::make_shared<DataTypeString>());
for (const auto & [_, discr] : new_variants_with_sizes)
{
if (new_variants.size() != max_dynamic_types)
new_variants.push_back(src_variants[discr]);
else
variants_to_convert_to_string.push_back(discr);
}
auto new_variant_type = std::make_shared<DataTypeVariant>(new_variants);
updateVariantInfoAndExpandVariantColumn(new_variant_type);
auto string_variant_discriminator = variant_info.variant_name_to_discriminator.at("String");
for (const auto & variant_name : dynamic_src.variant_info.variant_names)
{
auto it = variant_info.variant_name_to_discriminator.find(variant_name);
if (it == variant_info.variant_name_to_discriminator.end())
other_to_new_discriminators.push_back(string_variant_discriminator);
else
other_to_new_discriminators.push_back(it->second);
}
}
/// Convert to String all variants that couldn't be added.
std::unordered_map<ColumnVariant::Discriminator, ColumnPtr> variants_converted_to_string;
variants_converted_to_string.reserve(variants_to_convert_to_string.size());
for (auto discr : variants_to_convert_to_string)
{
auto [variant_start, variant_length] = variants_ranges[discr];
const auto & variant = src_variant_column.getVariantPtrByGlobalDiscriminator(discr);
if (variant_start == 0 && variant_length == variant->size())
variants_converted_to_string[discr] = castColumn(ColumnWithTypeAndName(variant, src_variants[discr], ""), std::make_shared<DataTypeString>());
else
variants_converted_to_string[discr] = castColumn(ColumnWithTypeAndName(variant->cut(variant_start, variant_length), src_variants[discr], ""), std::make_shared<DataTypeString>());
}
const auto & src_local_discriminators = src_variant_column.getLocalDiscriminators();
const auto & src_offsets = src_variant_column.getOffsets();
const auto & src_variant_columns = src_variant_column.getVariants();
size_t end = start + length;
for (size_t i = start; i != end; ++i)
{
auto local_discr = src_local_discriminators[i];
if (local_discr == ColumnVariant::NULL_DISCRIMINATOR)
{
variant_col.insertDefault();
}
else
{
auto global_discr = src_variant_column.globalDiscriminatorByLocal(local_discr);
auto to_global_discr = other_to_new_discriminators[global_discr];
auto it = variants_converted_to_string.find(global_discr);
if (it == variants_converted_to_string.end())
{
variant_col.insertIntoVariantFrom(to_global_discr, *src_variant_columns[local_discr], src_offsets[i]);
}
else
{
variant_col.insertIntoVariantFrom(to_global_discr, *it->second, src_offsets[i] - variants_ranges[global_discr].first);
}
}
}
}
void ColumnDynamic::insertManyFrom(const DB::IColumn & src_, size_t position, size_t length)
{
const auto & dynamic_src = assert_cast<const ColumnDynamic &>(src_);
/// Check if we have the same variants in both columns.
if (variant_info.variant_names == dynamic_src.variant_info.variant_names)
{
variant_column->insertManyFrom(*dynamic_src.variant_column, position, length);
return;
}
auto & variant_col = assert_cast<ColumnVariant &>(*variant_column);
/// If variants are different, we need to extend our variant with new variants.
if (auto * global_discriminators_mapping = combineVariants(dynamic_src.variant_info))
{
variant_col.insertManyFrom(*dynamic_src.variant_column, position, length, *global_discriminators_mapping);
return;
}
/// We cannot combine 2 Variant types as total number of variants exceeds the limit.
/// We need to insert single value, try to add only corresponding variant.
const auto & src_variant_col = assert_cast<const ColumnVariant &>(*dynamic_src.variant_column);
auto src_global_discr = src_variant_col.globalDiscriminatorAt(position);
if (src_global_discr == ColumnVariant::NULL_DISCRIMINATOR)
{
insertDefault();
return;
}
auto variant_type = assert_cast<const DataTypeVariant &>(*dynamic_src.variant_info.variant_type).getVariants()[src_global_discr];
if (addNewVariant(variant_type))
{
auto discr = variant_info.variant_name_to_discriminator[dynamic_src.variant_info.variant_names[src_global_discr]];
variant_col.insertManyIntoVariantFrom(discr, src_variant_col.getVariantByGlobalDiscriminator(src_global_discr), src_variant_col.offsetAt(position), length);
return;
}
addStringVariant();
auto tmp_variant_column = src_variant_col.getVariantByGlobalDiscriminator(src_global_discr).cloneEmpty();
tmp_variant_column->insertFrom(src_variant_col.getVariantByGlobalDiscriminator(src_global_discr), src_variant_col.offsetAt(position));
auto tmp_string_column = castColumn(ColumnWithTypeAndName(tmp_variant_column->getPtr(), variant_type, ""), std::make_shared<DataTypeString>());
auto string_variant_discr = variant_info.variant_name_to_discriminator["String"];
variant_col.insertManyIntoVariantFrom(string_variant_discr, *tmp_string_column, 0, length);
}
StringRef ColumnDynamic::serializeValueIntoArena(size_t n, DB::Arena & arena, const char *& begin) const
{
/// We cannot use Variant serialization here as it serializes discriminator + value,
/// but Dynamic doesn't have fixed mapping discriminator <-> variant type
/// as different Dynamic column can have different Variants.
/// Instead, we serialize null bit + variant type name (size + bytes) + value.
const auto & variant_col = assert_cast<const ColumnVariant &>(*variant_column);
auto discr = variant_col.globalDiscriminatorAt(n);
StringRef res;
UInt8 null_bit = discr == ColumnVariant::NULL_DISCRIMINATOR;
if (null_bit)
{
char * pos = arena.allocContinue(sizeof(UInt8), begin);
memcpy(pos, &null_bit, sizeof(UInt8));
res.data = pos;
res.size = sizeof(UInt8);
return res;
}
const auto & variant_name = variant_info.variant_names[discr];
size_t variant_name_size = variant_name.size();
char * pos = arena.allocContinue(sizeof(UInt8) + sizeof(size_t) + variant_name.size(), begin);
memcpy(pos, &null_bit, sizeof(UInt8));
memcpy(pos + sizeof(UInt8), &variant_name_size, sizeof(size_t));
memcpy(pos + sizeof(UInt8) + sizeof(size_t), variant_name.data(), variant_name.size());
res.data = pos;
res.size = sizeof(UInt8) + sizeof(size_t) + variant_name.size();
auto value_ref = variant_col.getVariantByGlobalDiscriminator(discr).serializeValueIntoArena(variant_col.offsetAt(n), arena, begin);
res.data = value_ref.data - res.size;
res.size += value_ref.size;
return res;
}
const char * ColumnDynamic::deserializeAndInsertFromArena(const char * pos)
{
auto & variant_col = assert_cast<ColumnVariant &>(*variant_column);
UInt8 null_bit = unalignedLoad<UInt8>(pos);
pos += sizeof(UInt8);
if (null_bit)
{
insertDefault();
return pos;
}
/// Read variant type name.
const size_t variant_name_size = unalignedLoad<size_t>(pos);
pos += sizeof(variant_name_size);
String variant_name;
variant_name.resize(variant_name_size);
memcpy(variant_name.data(), pos, variant_name_size);
pos += variant_name_size;
/// If we already have such variant, just deserialize it into corresponding variant column.
auto it = variant_info.variant_name_to_discriminator.find(variant_name);
if (it != variant_info.variant_name_to_discriminator.end())
{
auto discr = it->second;
return variant_col.deserializeVariantAndInsertFromArena(discr, pos);
}
/// If we don't have such variant, add it.
auto variant_type = DataTypeFactory::instance().get(variant_name);
if (likely(addNewVariant(variant_type)))
{
auto discr = variant_info.variant_name_to_discriminator[variant_name];
return variant_col.deserializeVariantAndInsertFromArena(discr, pos);
}
/// We reached maximum number of variants and couldn't add new variant.
/// We should always be able to add String variant and cast inserted value to String.
addStringVariant();
/// Create temporary column of this variant type and deserialize value into it.
auto tmp_variant_column = variant_type->createColumn();
pos = tmp_variant_column->deserializeAndInsertFromArena(pos);
/// Cast temporary column to String and insert this value into String variant.
auto str_column = castColumn(ColumnWithTypeAndName(tmp_variant_column->getPtr(), variant_type, ""), std::make_shared<DataTypeString>());
variant_col.insertIntoVariantFrom(variant_info.variant_name_to_discriminator["String"], *str_column, 0);
return pos;
}
const char * ColumnDynamic::skipSerializedInArena(const char * pos) const
{
UInt8 null_bit = unalignedLoad<UInt8>(pos);
pos += sizeof(UInt8);
if (null_bit)
return pos;
const size_t variant_name_size = unalignedLoad<size_t>(pos);
pos += sizeof(variant_name_size);
String variant_name;
variant_name.resize(variant_name_size);
memcpy(variant_name.data(), pos, variant_name_size);
pos += variant_name_size;
auto tmp_variant_column = DataTypeFactory::instance().get(variant_name)->createColumn();
return tmp_variant_column->skipSerializedInArena(pos);
}
void ColumnDynamic::updateHashWithValue(size_t n, SipHash & hash) const
{
const auto & variant_col = assert_cast<const ColumnVariant &>(*variant_column);
auto discr = variant_col.globalDiscriminatorAt(n);
if (discr == ColumnVariant::NULL_DISCRIMINATOR)
{
hash.update(discr);
return;
}
hash.update(variant_info.variant_names[discr]);
variant_col.getVariantByGlobalDiscriminator(discr).updateHashWithValue(variant_col.offsetAt(n), hash);
}
int ColumnDynamic::compareAt(size_t n, size_t m, const DB::IColumn & rhs, int nan_direction_hint) const
{
const auto & left_variant = assert_cast<const ColumnVariant &>(*variant_column);
const auto & right_dynamic = assert_cast<const ColumnDynamic &>(rhs);
const auto & right_variant = assert_cast<const ColumnVariant &>(*right_dynamic.variant_column);
auto left_discr = left_variant.globalDiscriminatorAt(n);
auto right_discr = right_variant.globalDiscriminatorAt(m);
/// Check if we have NULLs and return result based on nan_direction_hint.
if (left_discr == ColumnVariant::NULL_DISCRIMINATOR && right_discr == ColumnVariant::NULL_DISCRIMINATOR)
return 0;
else if (left_discr == ColumnVariant::NULL_DISCRIMINATOR)
return nan_direction_hint;
else if (right_discr == ColumnVariant::NULL_DISCRIMINATOR)
return -nan_direction_hint;
/// If rows have different types, we compare type names.
if (variant_info.variant_names[left_discr] != right_dynamic.variant_info.variant_names[right_discr])
return variant_info.variant_names[left_discr] < right_dynamic.variant_info.variant_names[right_discr] ? -1 : 1;
/// If rows have the same types, compare actual values from corresponding variants.
return left_variant.getVariantByGlobalDiscriminator(left_discr).compareAt(left_variant.offsetAt(n), right_variant.offsetAt(m), right_variant.getVariantByGlobalDiscriminator(right_discr), nan_direction_hint);
}
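/// Editor's sketch (not part of this commit) of the ordering implemented above:
///     - both rows NULL                       -> equal;
///     - exactly one row NULL                 -> ordered by nan_direction_hint, as for Nullable columns;
///     - different types, e.g. Int8 vs String -> ordered by type name, so any Int8 value sorts before any String value;
///     - same type                            -> ordered by that variant's own compareAt.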
ColumnPtr ColumnDynamic::compress() const
{
ColumnPtr variant_compressed = variant_column->compress();
size_t byte_size = variant_compressed->byteSize();
return ColumnCompressed::create(size(), byte_size,
[my_variant_compressed = std::move(variant_compressed), my_variant_info = variant_info, my_max_dynamic_types = max_dynamic_types, my_statistics = statistics]() mutable
{
return ColumnDynamic::create(my_variant_compressed->decompress(), my_variant_info, my_max_dynamic_types, my_statistics);
});
}
void ColumnDynamic::takeDynamicStructureFromSourceColumns(const Columns & source_columns)
{
if (!empty())
throw Exception(ErrorCodes::LOGICAL_ERROR, "takeDynamicStructureFromSourceColumns should be called only on empty Dynamic column");
/// During serialization of Dynamic column in MergeTree all Dynamic columns
/// in single part must have the same structure (the same variants). During merge
/// resulting column is constructed by inserting from source columns,
/// but it may happen that resulting column doesn't have rows from all source parts
/// but only from subset of them, and as a result some variants could be missing
/// and structures of resulting column may differ.
/// To solve this problem, before merge we create empty resulting column and use this method
/// to take dynamic structure from all source columns even if we won't insert
/// rows from some of them.
/// We want to construct resulting variant with most frequent variants from source columns and convert the rarest
/// variants to single String variant if we exceed the limit of variants.
/// First, collect all variants from all source columns and calculate total sizes.
std::unordered_map<String, size_t> total_sizes;
DataTypes all_variants;
for (const auto & source_column : source_columns)
{
const auto & source_dynamic = assert_cast<const ColumnDynamic &>(*source_column);
const auto & source_variant_column = source_dynamic.getVariantColumn();
const auto & source_variant_info = source_dynamic.getVariantInfo();
const auto & source_variants = assert_cast<const DataTypeVariant &>(*source_variant_info.variant_type).getVariants();
/// During deserialization from MergeTree we will have variant sizes statistics from the whole data part.
const auto & source_statistics = source_dynamic.getStatistics();
for (size_t i = 0; i != source_variants.size(); ++i)
{
const auto & variant_name = source_variant_info.variant_names[i];
auto it = total_sizes.find(variant_name);
/// Add this variant to the list of all variants if we didn't see it yet.
if (it == total_sizes.end())
{
all_variants.push_back(source_variants[i]);
it = total_sizes.emplace(variant_name, 0).first;
}
auto statistics_it = source_statistics.data.find(variant_name);
size_t size = statistics_it == source_statistics.data.end() ? source_variant_column.getVariantByGlobalDiscriminator(i).size() : statistics_it->second;
it->second += size;
}
}
DataTypePtr result_variant_type;
/// Check if the number of all variants exceeds the limit.
if (all_variants.size() > max_dynamic_types || (all_variants.size() == max_dynamic_types && !total_sizes.contains("String")))
{
/// Create list of variants with their sizes and sort it.
std::vector<std::pair<size_t, DataTypePtr>> variants_with_sizes;
variants_with_sizes.reserve(all_variants.size());
for (const auto & variant : all_variants)
variants_with_sizes.emplace_back(total_sizes[variant->getName()], variant);
std::sort(variants_with_sizes.begin(), variants_with_sizes.end(), std::greater());
/// Take first max_dynamic_types variants from sorted list.
DataTypes result_variants;
result_variants.reserve(max_dynamic_types);
/// Add String variant in advance.
result_variants.push_back(std::make_shared<DataTypeString>());
for (const auto & [_, variant] : variants_with_sizes)
{
if (result_variants.size() == max_dynamic_types)
break;
if (variant->getName() != "String")
result_variants.push_back(variant);
}
result_variant_type = std::make_shared<DataTypeVariant>(result_variants);
}
else
{
result_variant_type = std::make_shared<DataTypeVariant>(all_variants);
}
/// Now we have resulting Variant and can fill variant info.
variant_info.variant_type = result_variant_type;
variant_info.variant_name = result_variant_type->getName();
const auto & result_variants = assert_cast<const DataTypeVariant &>(*result_variant_type).getVariants();
variant_info.variant_names.clear();
variant_info.variant_names.reserve(result_variants.size());
variant_info.variant_name_to_discriminator.clear();
variant_info.variant_name_to_discriminator.reserve(result_variants.size());
statistics.data.clear();
statistics.data.reserve(result_variants.size());
statistics.source = Statistics::Source::MERGE;
for (size_t i = 0; i != result_variants.size(); ++i)
{
auto variant_name = result_variants[i]->getName();
variant_info.variant_names.push_back(variant_name);
variant_info.variant_name_to_discriminator[variant_name] = i;
statistics.data[variant_name] = total_sizes[variant_name];
}
variant_column = variant_info.variant_type->createColumn();
/// Now we have the resulting Variant that will be used in all merged columns.
/// Variants can also contain Dynamic columns inside, we should collect
/// all source variants that will be used in the resulting merged column
/// and call takeDynamicStructureFromSourceColumns on all resulting variants.
std::vector<Columns> variants_source_columns;
variants_source_columns.resize(variant_info.variant_names.size());
for (const auto & source_column : source_columns)
{
const auto & source_dynamic_column = assert_cast<const ColumnDynamic &>(*source_column);
const auto & source_variant_info = source_dynamic_column.getVariantInfo();
for (size_t i = 0; i != variant_info.variant_names.size(); ++i)
{
/// Try to find this variant in current source column.
auto it = source_variant_info.variant_name_to_discriminator.find(variant_info.variant_names[i]);
if (it != source_variant_info.variant_name_to_discriminator.end())
variants_source_columns[i].push_back(source_dynamic_column.getVariantColumn().getVariantPtrByGlobalDiscriminator(it->second));
}
}
auto & variant_col = getVariantColumn();
for (size_t i = 0; i != variant_info.variant_names.size(); ++i)
variant_col.getVariantByGlobalDiscriminator(i).takeDynamicStructureFromSourceColumns(variants_source_columns[i]);
}
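/// Editor's sketch (not part of this commit) of the selection step above, with hypothetical sizes
/// and max_dynamic_types = 3:
///     total_sizes = {Int64: 1000, Float64: 500, Array(Int64): 3, UInt8: 1}
/// Four variants exceed the limit, so the result keeps String (added in advance) plus the two largest
/// variants, Int64 and Float64; rows of Array(Int64) and UInt8 inserted during the merge are cast to String.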
void ColumnDynamic::applyNullMap(const ColumnVector<UInt8>::Container & null_map)
{
assert_cast<ColumnVariant &>(*variant_column).applyNullMap(null_map);
}
void ColumnDynamic::applyNegatedNullMap(const ColumnVector<UInt8>::Container & null_map)
{
assert_cast<ColumnVariant &>(*variant_column).applyNegatedNullMap(null_map);
}
}
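A minimal usage sketch of the overflow behaviour implemented above (editor's illustration, not part of this commit; it assumes the gtest setup used by the unit tests at the end of this diff and that small max_types limits behave like the 255-variant limit those tests exercise):

#include <Columns/ColumnDynamic.h>
#include <Core/Field.h>
#include <gtest/gtest.h>

using namespace DB;

TEST(ColumnDynamicSketch, FallbackToString)
{
    /// Allow at most two distinct variants inside the Dynamic column.
    auto column = ColumnDynamic::create(2);
    column->insert(Field(42));      /// creates the Int8 variant
    column->insert(Field("str"));   /// creates the String variant
    column->insert(Field(42.42));   /// no room left for Float64, so the value is stored as the string "42.42"
    ASSERT_EQ((*column)[2], Field("42.42"));
}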

src/Columns/ColumnDynamic.h

@ -0,0 +1,365 @@
#pragma once
#include <Columns/IColumn.h>
#include <Columns/ColumnVector.h>
#include <Columns/ColumnVariant.h>
#include <DataTypes/IDataType.h>
namespace DB
{
/**
* Column for storing Dynamic type values.
* Dynamic column allows inserting and storing values of any data type inside.
* Inside it stores:
* - Variant column with all inserted values of different types.
* - Information about currently stored variants.
*
* When new values are inserted into Dynamic column, the internal Variant
* type and column are extended if the inserted value has a new type.
*/
class ColumnDynamic final : public COWHelper<IColumnHelper<ColumnDynamic>, ColumnDynamic>
{
public:
/// Statistics about variant sizes collected from a MergeTree data part; used to keep the most frequent variants during merges.
struct Statistics
{
enum class Source
{
READ, /// Statistics were loaded into column during reading from MergeTree.
MERGE, /// Statistics were calculated during merge of several MergeTree parts.
};
/// Source of the statistics.
Source source;
/// Statistics data: (variant name) -> (total variant size in data part).
std::unordered_map<String, size_t> data;
};
private:
friend class COWHelper<IColumnHelper<ColumnDynamic>, ColumnDynamic>;
struct VariantInfo
{
DataTypePtr variant_type;
/// Name of the whole Variant type, cached to avoid calling getName() every time.
String variant_name;
/// Names of the individual variants, cached to avoid calling getName() on them every time.
Names variant_names;
/// Mapping (variant name) -> (global discriminator).
/// It's used during variant extension.
std::unordered_map<String, UInt8> variant_name_to_discriminator;
};
explicit ColumnDynamic(size_t max_dynamic_types_);
ColumnDynamic(MutableColumnPtr variant_column_, const VariantInfo & variant_info_, size_t max_dynamic_types_, const Statistics & statistics_ = {});
public:
/** Create immutable column using immutable arguments. These arguments may be shared with other columns.
* Use IColumn::mutate in order to make mutable column and mutate shared nested columns.
*/
using Base = COWHelper<IColumnHelper<ColumnDynamic>, ColumnDynamic>;
static Ptr create(const ColumnPtr & variant_column_, const VariantInfo & variant_info_, size_t max_dynamic_types_, const Statistics & statistics_ = {})
{
return ColumnDynamic::create(variant_column_->assumeMutable(), variant_info_, max_dynamic_types_, statistics_);
}
static MutablePtr create(MutableColumnPtr variant_column_, const VariantInfo & variant_info_, size_t max_dynamic_types_, const Statistics & statistics_ = {})
{
return Base::create(std::move(variant_column_), variant_info_, max_dynamic_types_, statistics_);
}
static MutablePtr create(MutableColumnPtr variant_column_, const DataTypePtr & variant_type, size_t max_dynamic_types_, const Statistics & statistics_ = {});
static ColumnPtr create(ColumnPtr variant_column_, const DataTypePtr & variant_type, size_t max_dynamic_types_, const Statistics & statistics_ = {})
{
return create(variant_column_->assumeMutable(), variant_type, max_dynamic_types_, statistics_);
}
static MutablePtr create(size_t max_dynamic_types_)
{
return Base::create(max_dynamic_types_);
}
std::string getName() const override { return "Dynamic(max_types=" + std::to_string(max_dynamic_types) + ")"; }
const char * getFamilyName() const override
{
return "Dynamic";
}
TypeIndex getDataType() const override
{
return TypeIndex::Dynamic;
}
MutableColumnPtr cloneEmpty() const override
{
/// Keep current dynamic structure
return Base::create(variant_column->cloneEmpty(), variant_info, max_dynamic_types, statistics);
}
MutableColumnPtr cloneResized(size_t size) const override
{
return Base::create(variant_column->cloneResized(size), variant_info, max_dynamic_types, statistics);
}
size_t size() const override
{
return variant_column->size();
}
Field operator[](size_t n) const override
{
return (*variant_column)[n];
}
void get(size_t n, Field & res) const override
{
variant_column->get(n, res);
}
bool isDefaultAt(size_t n) const override
{
return variant_column->isDefaultAt(n);
}
bool isNullAt(size_t n) const override
{
return variant_column->isNullAt(n);
}
StringRef getDataAt(size_t n) const override
{
return variant_column->getDataAt(n);
}
void insertData(const char * pos, size_t length) override
{
variant_column->insertData(pos, length);
}
void insert(const Field & x) override;
bool tryInsert(const Field & x) override;
void insertFrom(const IColumn & src_, size_t n) override;
void insertRangeFrom(const IColumn & src, size_t start, size_t length) override;
void insertManyFrom(const IColumn & src, size_t position, size_t length) override;
void insertDefault() override
{
variant_column->insertDefault();
}
void insertManyDefaults(size_t length) override
{
variant_column->insertManyDefaults(length);
}
void popBack(size_t n) override
{
variant_column->popBack(n);
}
StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const override;
const char * deserializeAndInsertFromArena(const char * pos) override;
const char * skipSerializedInArena(const char * pos) const override;
void updateHashWithValue(size_t n, SipHash & hash) const override;
void updateWeakHash32(WeakHash32 & hash) const override
{
variant_column->updateWeakHash32(hash);
}
void updateHashFast(SipHash & hash) const override
{
variant_column->updateHashFast(hash);
}
ColumnPtr filter(const Filter & filt, ssize_t result_size_hint) const override
{
return create(variant_column->filter(filt, result_size_hint), variant_info, max_dynamic_types);
}
void expand(const Filter & mask, bool inverted) override
{
variant_column->expand(mask, inverted);
}
ColumnPtr permute(const Permutation & perm, size_t limit) const override
{
return create(variant_column->permute(perm, limit), variant_info, max_dynamic_types);
}
ColumnPtr index(const IColumn & indexes, size_t limit) const override
{
return create(variant_column->index(indexes, limit), variant_info, max_dynamic_types);
}
ColumnPtr replicate(const Offsets & replicate_offsets) const override
{
return create(variant_column->replicate(replicate_offsets), variant_info, max_dynamic_types);
}
MutableColumns scatter(ColumnIndex num_columns, const Selector & selector) const override
{
MutableColumns scattered_variant_columns = variant_column->scatter(num_columns, selector);
MutableColumns scattered_columns;
scattered_columns.reserve(num_columns);
for (auto & scattered_variant_column : scattered_variant_columns)
scattered_columns.emplace_back(create(std::move(scattered_variant_column), variant_info, max_dynamic_types));
return scattered_columns;
}
int compareAt(size_t n, size_t m, const IColumn & rhs, int nan_direction_hint) const override;
bool hasEqualValues() const override
{
return variant_column->hasEqualValues();
}
void getExtremes(Field & min, Field & max) const override
{
variant_column->getExtremes(min, max);
}
void getPermutation(IColumn::PermutationSortDirection direction, IColumn::PermutationSortStability stability,
size_t limit, int nan_direction_hint, IColumn::Permutation & res) const override
{
variant_column->getPermutation(direction, stability, limit, nan_direction_hint, res);
}
void updatePermutation(IColumn::PermutationSortDirection direction, IColumn::PermutationSortStability stability,
size_t limit, int nan_direction_hint, IColumn::Permutation & res, EqualRanges & equal_ranges) const override
{
variant_column->updatePermutation(direction, stability, limit, nan_direction_hint, res, equal_ranges);
}
void reserve(size_t n) override
{
variant_column->reserve(n);
}
void ensureOwnership() override
{
variant_column->ensureOwnership();
}
size_t byteSize() const override
{
return variant_column->byteSize();
}
size_t byteSizeAt(size_t n) const override
{
return variant_column->byteSizeAt(n);
}
size_t allocatedBytes() const override
{
return variant_column->allocatedBytes();
}
void protect() override
{
variant_column->protect();
}
void forEachSubcolumn(MutableColumnCallback callback) override
{
callback(variant_column);
}
void forEachSubcolumnRecursively(RecursiveMutableColumnCallback callback) override
{
callback(*variant_column);
variant_column->forEachSubcolumnRecursively(callback);
}
bool structureEquals(const IColumn & rhs) const override
{
if (const auto * rhs_concrete = typeid_cast<const ColumnDynamic *>(&rhs))
return max_dynamic_types == rhs_concrete->max_dynamic_types;
return false;
}
ColumnPtr compress() const override;
double getRatioOfDefaultRows(double sample_ratio) const override
{
return variant_column->getRatioOfDefaultRows(sample_ratio);
}
UInt64 getNumberOfDefaultRows() const override
{
return variant_column->getNumberOfDefaultRows();
}
void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override
{
variant_column->getIndicesOfNonDefaultRows(indices, from, limit);
}
void finalize() override
{
variant_column->finalize();
}
bool isFinalized() const override
{
return variant_column->isFinalized();
}
/// Apply null map to a nested Variant column.
void applyNullMap(const ColumnVector<UInt8>::Container & null_map);
void applyNegatedNullMap(const ColumnVector<UInt8>::Container & null_map);
const VariantInfo & getVariantInfo() const { return variant_info; }
const ColumnPtr & getVariantColumnPtr() const { return variant_column; }
ColumnPtr & getVariantColumnPtr() { return variant_column; }
const ColumnVariant & getVariantColumn() const { return assert_cast<const ColumnVariant &>(*variant_column); }
ColumnVariant & getVariantColumn() { return assert_cast<ColumnVariant &>(*variant_column); }
bool addNewVariant(const DataTypePtr & new_variant);
void addStringVariant();
bool hasDynamicStructure() const override { return true; }
void takeDynamicStructureFromSourceColumns(const Columns & source_columns) override;
const Statistics & getStatistics() const { return statistics; }
size_t getMaxDynamicTypes() const { return max_dynamic_types; }
private:
/// Combine current variant with the other variant and return global discriminators mapping
/// from other variant to the combined one. It's used for inserting from
/// different variants.
/// Returns nullptr if maximum number of variants is reached and the new variant cannot be created.
std::vector<UInt8> * combineVariants(const VariantInfo & other_variant_info);
void updateVariantInfoAndExpandVariantColumn(const DataTypePtr & new_variant_type);
WrappedPtr variant_column;
/// Store the type of current variant with some additional information.
VariantInfo variant_info;
/// The maximum number of different types that can be stored in this Dynamic column.
/// If exceeded, all new variants will be converted to String.
size_t max_dynamic_types;
/// Size statistics of each variant from the MergeTree data part.
/// Used in takeDynamicStructureFromSourceColumns and set during deserialization.
Statistics statistics;
/// Cache (Variant name) -> (global discriminators mapping from this variant to current variant in Dynamic column).
/// Used to avoid mappings recalculation in combineVariants for the same Variant types.
std::unordered_map<String, std::vector<UInt8>> variant_mappings_cache;
/// Cache of Variant types that couldn't be combined with current variant in Dynamic column.
/// Used to avoid checking if combination is possible for the same Variant types.
std::unordered_set<String> variants_with_failed_combination;
};
}
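A small sketch of the factory methods declared above (editor's illustration, not part of this commit): a Dynamic column can be created on top of an existing Variant type, which becomes its initial internal structure.

#include <Columns/ColumnDynamic.h>
#include <DataTypes/DataTypeFactory.h>

using namespace DB;

/// Wrap an empty column of a known Variant type into a Dynamic column with a limit of 255 types
/// (the value used by the unit tests in this diff).
MutableColumnPtr makeDynamicOverVariant()
{
    auto variant_type = DataTypeFactory::instance().get("Variant(Int64, String)");
    return ColumnDynamic::create(variant_type->createColumn(), variant_type, /*max_dynamic_types_=*/ 255);
}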


@ -312,4 +312,13 @@ ColumnPtr ColumnMap::compress() const
});
}
void ColumnMap::takeDynamicStructureFromSourceColumns(const Columns & source_columns)
{
Columns nested_source_columns;
nested_source_columns.reserve(source_columns.size());
for (const auto & source_column : source_columns)
nested_source_columns.push_back(assert_cast<const ColumnMap &>(*source_column).getNestedColumnPtr());
nested->takeDynamicStructureFromSourceColumns(nested_source_columns);
}
}


@ -104,6 +104,9 @@ public:
ColumnTuple & getNestedData() { return assert_cast<ColumnTuple &>(getNestedColumn().getData()); }
ColumnPtr compress() const override;
bool hasDynamicStructure() const override { return nested->hasDynamicStructure(); }
void takeDynamicStructureFromSourceColumns(const Columns & source_columns) override;
};
}


@ -868,6 +868,15 @@ ColumnPtr ColumnNullable::getNestedColumnWithDefaultOnNull() const
return res;
}
void ColumnNullable::takeDynamicStructureFromSourceColumns(const Columns & source_columns)
{
Columns nested_source_columns;
nested_source_columns.reserve(source_columns.size());
for (const auto & source_column : source_columns)
nested_source_columns.push_back(assert_cast<const ColumnNullable &>(*source_column).getNestedColumnPtr());
nested_column->takeDynamicStructureFromSourceColumns(nested_source_columns);
}
ColumnPtr makeNullable(const ColumnPtr & column)
{
if (isColumnNullable(*column))
@ -924,4 +933,23 @@ ColumnPtr makeNullableOrLowCardinalityNullableSafe(const ColumnPtr & column)
return column;
}
ColumnPtr removeNullable(const ColumnPtr & column)
{
if (const auto * column_nullable = typeid_cast<const ColumnNullable *>(column.get()))
return column_nullable->getNestedColumnPtr();
return column;
}
ColumnPtr removeNullableOrLowCardinalityNullable(const ColumnPtr & column)
{
if (const auto * column_low_cardinality = typeid_cast<const ColumnLowCardinality *>(column.get()))
{
if (!column_low_cardinality->nestedIsNullable())
return column;
return column_low_cardinality->cloneWithDefaultOnNull();
}
return removeNullable(column);
}
}


@ -190,6 +190,9 @@ public:
/// Check that size of null map equals to size of nested column.
void checkConsistency() const;
bool hasDynamicStructure() const override { return nested_column->hasDynamicStructure(); }
void takeDynamicStructureFromSourceColumns(const Columns & source_columns) override;
private:
WrappedPtr nested_column;
WrappedPtr null_map;
@ -211,4 +214,7 @@ ColumnPtr makeNullableSafe(const ColumnPtr & column);
ColumnPtr makeNullableOrLowCardinalityNullable(const ColumnPtr & column);
ColumnPtr makeNullableOrLowCardinalityNullableSafe(const ColumnPtr & column);
ColumnPtr removeNullable(const ColumnPtr & column);
ColumnPtr removeNullableOrLowCardinalityNullable(const ColumnPtr & column);
}


@ -801,6 +801,15 @@ ColumnSparse::Iterator ColumnSparse::getIterator(size_t n) const
return Iterator(offsets_data, _size, current_offset, n);
}
void ColumnSparse::takeDynamicStructureFromSourceColumns(const Columns & source_columns)
{
Columns values_source_columns;
values_source_columns.reserve(source_columns.size());
for (const auto & source_column : source_columns)
values_source_columns.push_back(assert_cast<const ColumnSparse &>(*source_column).getValuesPtr());
values->takeDynamicStructureFromSourceColumns(values_source_columns);
}
ColumnPtr recursiveRemoveSparse(const ColumnPtr & column)
{
if (!column)


@ -148,6 +148,9 @@ public:
size_t sizeOfValueIfFixed() const override { return values->sizeOfValueIfFixed() + values->sizeOfValueIfFixed(); }
bool isCollationSupported() const override { return values->isCollationSupported(); }
bool hasDynamicStructure() const override { return values->hasDynamicStructure(); }
void takeDynamicStructureFromSourceColumns(const Columns & source_columns) override;
size_t getNumberOfTrailingDefaults() const
{
return offsets->empty() ? _size : _size - getOffsetsData().back() - 1;


@ -572,6 +572,34 @@ bool ColumnTuple::isCollationSupported() const
return false;
}
bool ColumnTuple::hasDynamicStructure() const
{
for (const auto & column : columns)
{
if (column->hasDynamicStructure())
return true;
}
return false;
}
void ColumnTuple::takeDynamicStructureFromSourceColumns(const Columns & source_columns)
{
std::vector<Columns> nested_source_columns;
nested_source_columns.resize(columns.size());
for (size_t i = 0; i != columns.size(); ++i)
nested_source_columns[i].reserve(source_columns.size());
for (const auto & source_column : source_columns)
{
const auto & nsource_columns = assert_cast<const ColumnTuple &>(*source_column).getColumns();
for (size_t i = 0; i != nsource_columns.size(); ++i)
nested_source_columns[i].push_back(nsource_columns[i]);
}
for (size_t i = 0; i != columns.size(); ++i)
columns[i]->takeDynamicStructureFromSourceColumns(nested_source_columns[i]);
}
ColumnPtr ColumnTuple::compress() const
{


@ -114,6 +114,9 @@ public:
const ColumnPtr & getColumnPtr(size_t idx) const { return columns[idx]; }
ColumnPtr & getColumnPtr(size_t idx) { return columns[idx]; }
bool hasDynamicStructure() const override;
void takeDynamicStructureFromSourceColumns(const Columns & source_columns) override;
private:
int compareAtImpl(size_t n, size_t m, const IColumn & rhs, int nan_direction_hint, const Collator * collator=nullptr) const;


@ -12,7 +12,6 @@
#include <Common/Arena.h>
#include <Common/SipHash.h>
#include <Common/HashTable/Hash.h>
#include <DataTypes/Serializations/SerializationInfoTuple.h>
#include <Columns/MaskOperations.h>
@ -452,16 +451,18 @@ bool ColumnVariant::tryInsert(const DB::Field & x)
return false;
}
void ColumnVariant::insertFrom(const IColumn & src_, size_t n)
void ColumnVariant::insertFromImpl(const DB::IColumn & src_, size_t n, const std::vector<ColumnVariant::Discriminator> * global_discriminators_mapping)
{
const size_t num_variants = variants.size();
const ColumnVariant & src = assert_cast<const ColumnVariant &>(src_);
const size_t num_variants = variants.size();
if (src.variants.size() != num_variants)
if (!global_discriminators_mapping && src.variants.size() != num_variants)
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot insert value of Variant type with different number of types");
/// Remember that src column can have different local variants order.
Discriminator global_discr = src.globalDiscriminatorAt(n);
Discriminator src_global_discr = src.globalDiscriminatorAt(n);
Discriminator global_discr = src_global_discr;
if (global_discriminators_mapping && src_global_discr != NULL_DISCRIMINATOR)
global_discr = (*global_discriminators_mapping)[src_global_discr];
Discriminator local_discr = localDiscriminatorByGlobal(global_discr);
getLocalDiscriminators().push_back(local_discr);
if (local_discr == NULL_DISCRIMINATOR)
@ -471,25 +472,15 @@ void ColumnVariant::insertFrom(const IColumn & src_, size_t n)
else
{
getOffsets().push_back(variants[local_discr]->size());
variants[local_discr]->insertFrom(src.getVariantByGlobalDiscriminator(global_discr), src.offsetAt(n));
variants[local_discr]->insertFrom(src.getVariantByGlobalDiscriminator(src_global_discr), src.offsetAt(n));
}
}
void ColumnVariant::insertIntoVariant(const DB::Field & x, Discriminator global_discr)
{
if (global_discr > variants.size())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Invalid global discriminator: {}. The number of variants is {}", size_t(global_discr), variants.size());
auto & variant = getVariantByGlobalDiscriminator(global_discr);
variant.insert(x);
getLocalDiscriminators().push_back(localDiscriminatorByGlobal(global_discr));
getOffsets().push_back(variant.size() - 1);
}
void ColumnVariant::insertRangeFrom(const IColumn & src_, size_t start, size_t length)
void ColumnVariant::insertRangeFromImpl(const DB::IColumn & src_, size_t start, size_t length, const std::vector<ColumnVariant::Discriminator> * global_discriminators_mapping)
{
const size_t num_variants = variants.size();
const auto & src = assert_cast<const ColumnVariant &>(src_);
if (src.variants.size() != num_variants)
if (!global_discriminators_mapping && src.variants.size() != num_variants)
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot insert value of Variant type with different number of types");
if (start + length > src.getLocalDiscriminators().size())
@ -507,7 +498,12 @@ void ColumnVariant::insertRangeFrom(const IColumn & src_, size_t start, size_t l
/// In this case we can simply call insertRangeFrom on this single variant.
if (auto non_empty_src_local_discr = src.getLocalDiscriminatorOfOneNoneEmptyVariantNoNulls())
{
auto local_discr = localDiscriminatorByGlobal(src.globalDiscriminatorByLocal(*non_empty_src_local_discr));
Discriminator src_global_discr = src.globalDiscriminatorByLocal(*non_empty_src_local_discr);
Discriminator global_discr = src_global_discr;
if (global_discriminators_mapping && src_global_discr != NULL_DISCRIMINATOR)
global_discr = (*global_discriminators_mapping)[src_global_discr];
Discriminator local_discr = localDiscriminatorByGlobal(global_discr);
size_t offset = variants[local_discr]->size();
variants[local_discr]->insertRangeFrom(*src.variants[*non_empty_src_local_discr], start, length);
getLocalDiscriminators().resize_fill(local_discriminators->size() + length, local_discr);
@ -522,7 +518,7 @@ void ColumnVariant::insertRangeFrom(const IColumn & src_, size_t start, size_t l
/// collect ranges we need to insert for all variants and update offsets.
/// nested_ranges[i].first - offset in src.variants[i]
/// nested_ranges[i].second - length in src.variants[i]
std::vector<std::pair<size_t, size_t>> nested_ranges(num_variants, {0, 0});
std::vector<std::pair<size_t, size_t>> nested_ranges(src.variants.size(), {0, 0});
auto & offsets_data = getOffsets();
offsets_data.reserve(offsets_data.size() + length);
auto & local_discriminators_data = getLocalDiscriminators();
@ -533,7 +529,11 @@ void ColumnVariant::insertRangeFrom(const IColumn & src_, size_t start, size_t l
{
/// We insert from src.variants[src_local_discr] to variants[local_discr]
Discriminator src_local_discr = src_local_discriminators_data[i];
Discriminator local_discr = localDiscriminatorByGlobal(src.globalDiscriminatorByLocal(src_local_discr));
Discriminator src_global_discr = src.globalDiscriminatorByLocal(src_local_discr);
Discriminator global_discr = src_global_discr;
if (global_discriminators_mapping && src_global_discr != NULL_DISCRIMINATOR)
global_discr = (*global_discriminators_mapping)[src_global_discr];
Discriminator local_discr = localDiscriminatorByGlobal(global_discr);
local_discriminators_data.push_back(local_discr);
if (local_discr == NULL_DISCRIMINATOR)
{
@ -553,22 +553,29 @@ void ColumnVariant::insertRangeFrom(const IColumn & src_, size_t start, size_t l
for (size_t src_local_discr = 0; src_local_discr != nested_ranges.size(); ++src_local_discr)
{
auto [nested_start, nested_length] = nested_ranges[src_local_discr];
auto local_discr = localDiscriminatorByGlobal(src.globalDiscriminatorByLocal(src_local_discr));
Discriminator src_global_discr = src.globalDiscriminatorByLocal(src_local_discr);
Discriminator global_discr = src_global_discr;
if (global_discriminators_mapping && src_global_discr != NULL_DISCRIMINATOR)
global_discr = (*global_discriminators_mapping)[src_global_discr];
Discriminator local_discr = localDiscriminatorByGlobal(global_discr);
if (nested_length)
variants[local_discr]->insertRangeFrom(*src.variants[src_local_discr], nested_start, nested_length);
}
}
void ColumnVariant::insertManyFrom(const DB::IColumn & src_, size_t position, size_t length)
void ColumnVariant::insertManyFromImpl(const DB::IColumn & src_, size_t position, size_t length, const std::vector<ColumnVariant::Discriminator> * global_discriminators_mapping)
{
const size_t num_variants = variants.size();
const auto & src = assert_cast<const ColumnVariant &>(src_);
if (src.variants.size() != num_variants)
if (!global_discriminators_mapping && src.variants.size() != num_variants)
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot insert value of Variant type with different number of types");
/// Remember that src column can have different local variants order.
Discriminator src_local_discr = src.localDiscriminatorAt(position);
Discriminator local_discr = localDiscriminatorByGlobal(src.globalDiscriminatorByLocal(src_local_discr));
Discriminator src_global_discr = src.globalDiscriminatorByLocal(src_local_discr);
Discriminator global_discr = src_global_discr;
if (global_discriminators_mapping && src_global_discr != NULL_DISCRIMINATOR)
global_discr = (*global_discriminators_mapping)[src_global_discr];
Discriminator local_discr = localDiscriminatorByGlobal(global_discr);
auto & local_discriminators_data = getLocalDiscriminators();
local_discriminators_data.resize_fill(local_discriminators_data.size() + length, local_discr);
@ -588,6 +595,72 @@ void ColumnVariant::insertManyFrom(const DB::IColumn & src_, size_t position, si
}
}
void ColumnVariant::insertFrom(const IColumn & src_, size_t n)
{
insertFromImpl(src_, n, nullptr);
}
void ColumnVariant::insertRangeFrom(const IColumn & src_, size_t start, size_t length)
{
insertRangeFromImpl(src_, start, length, nullptr);
}
void ColumnVariant::insertManyFrom(const DB::IColumn & src_, size_t position, size_t length)
{
insertManyFromImpl(src_, position, length, nullptr);
}
void ColumnVariant::insertFrom(const DB::IColumn & src_, size_t n, const std::vector<ColumnVariant::Discriminator> & global_discriminators_mapping)
{
insertFromImpl(src_, n, &global_discriminators_mapping);
}
void ColumnVariant::insertRangeFrom(const IColumn & src_, size_t start, size_t length, const std::vector<ColumnVariant::Discriminator> & global_discriminators_mapping)
{
insertRangeFromImpl(src_, start, length, &global_discriminators_mapping);
}
void ColumnVariant::insertManyFrom(const DB::IColumn & src_, size_t position, size_t length, const std::vector<ColumnVariant::Discriminator> & global_discriminators_mapping)
{
insertManyFromImpl(src_, position, length, &global_discriminators_mapping);
}
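/// Editor's sketch (not part of this commit): the mapping passed to the overloads above translates the
/// source column's global discriminators into this column's. For example, if the source Variant stores
/// Int8 under global discriminator 0 and String under 1, while this column stores them under 1 and 2,
/// then global_discriminators_mapping = {0 -> 1, 1 -> 2} and rows are routed to the matching variants.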
void ColumnVariant::insertIntoVariantFrom(DB::ColumnVariant::Discriminator global_discr, const DB::IColumn & src_, size_t n)
{
Discriminator local_discr = localDiscriminatorByGlobal(global_discr);
getLocalDiscriminators().push_back(local_discr);
getOffsets().push_back(variants[local_discr]->size());
variants[local_discr]->insertFrom(src_, n);
}
void ColumnVariant::insertRangeIntoVariantFrom(DB::ColumnVariant::Discriminator global_discr, const DB::IColumn & src_, size_t start, size_t length)
{
Discriminator local_discr = localDiscriminatorByGlobal(global_discr);
auto & local_discriminators_data = getLocalDiscriminators();
local_discriminators_data.resize_fill(local_discriminators_data.size() + length, local_discr);
auto & offsets_data = getOffsets();
size_t offset = variants[local_discr]->size();
offsets_data.reserve(offsets_data.size() + length);
for (size_t i = 0; i != length; ++i)
offsets_data.push_back(offset + i);
variants[local_discr]->insertRangeFrom(src_, start, length);
}
void ColumnVariant::insertManyIntoVariantFrom(DB::ColumnVariant::Discriminator global_discr, const DB::IColumn & src_, size_t position, size_t length)
{
Discriminator local_discr = localDiscriminatorByGlobal(global_discr);
auto & local_discriminators_data = getLocalDiscriminators();
local_discriminators_data.resize_fill(local_discriminators_data.size() + length, local_discr);
auto & offsets_data = getOffsets();
size_t offset = variants[local_discr]->size();
offsets_data.reserve(offsets_data.size() + length);
for (size_t i = 0; i != length; ++i)
offsets_data.push_back(offset + i);
variants[local_discr]->insertManyFrom(src_, position, length);
}
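/// Editor's note (not part of this commit): these three helpers insert into one known variant, given its
/// global discriminator, without inspecting the source column's discriminators. ColumnDynamic resolves the
/// destination once and inserts directly, e.g.
///     auto discr = variant_info.variant_name_to_discriminator["String"];
///     variant_col.insertManyIntoVariantFrom(discr, *tmp_string_column, 0, length);
/// as in ColumnDynamic::insertManyFrom earlier in this diff.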
void ColumnVariant::insertDefault()
{
getLocalDiscriminators().push_back(NULL_DISCRIMINATOR);
@ -678,6 +751,14 @@ const char * ColumnVariant::deserializeAndInsertFromArena(const char * pos)
return variants[local_discr]->deserializeAndInsertFromArena(pos);
}
const char * ColumnVariant::deserializeVariantAndInsertFromArena(DB::ColumnVariant::Discriminator global_discr, const char * pos)
{
Discriminator local_discr = localDiscriminatorByGlobal(global_discr);
getLocalDiscriminators().push_back(local_discr);
getOffsets().push_back(variants[local_discr]->size());
return variants[local_discr]->deserializeAndInsertFromArena(pos);
}
const char * ColumnVariant::skipSerializedInArena(const char * pos) const
{
Discriminator global_discr = unalignedLoad<Discriminator>(pos);
@ -1426,4 +1507,54 @@ void ColumnVariant::applyNullMapImpl(const ColumnVector<UInt8>::Container & null
}
}
void ColumnVariant::extend(const std::vector<Discriminator> & old_to_new_global_discriminators, std::vector<std::pair<MutableColumnPtr, Discriminator>> && new_variants_and_discriminators)
{
/// Update global discriminators for current variants.
for (Discriminator & global_discr : local_to_global_discriminators)
global_discr = old_to_new_global_discriminators[global_discr];
/// Add new variants.
variants.reserve(variants.size() + new_variants_and_discriminators.size());
local_to_global_discriminators.reserve(local_to_global_discriminators.size() + new_variants_and_discriminators.size());
for (auto & new_variant_and_discriminator : new_variants_and_discriminators)
{
variants.emplace_back(std::move(new_variant_and_discriminator.first));
local_to_global_discriminators.push_back(new_variant_and_discriminator.second);
}
/// Update global -> local discriminators matching.
global_to_local_discriminators.resize(local_to_global_discriminators.size());
for (Discriminator local_discr = 0; local_discr != local_to_global_discriminators.size(); ++local_discr)
global_to_local_discriminators[local_to_global_discriminators[local_discr]] = local_discr;
}
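/// Editor's sketch (not part of this commit): suppose the column currently has Int8 at global
/// discriminator 0 and String at 1, and the combined Variant places Float64 at 0, Int8 at 1, String at 2.
/// Then old_to_new_global_discriminators = {0 -> 1, 1 -> 2} and new_variants_and_discriminators holds a
/// single entry {empty Float64 column, 0}; no row data is rewritten, only the discriminator tables change.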
bool ColumnVariant::hasDynamicStructure() const
{
for (const auto & variant : variants)
{
if (variant->hasDynamicStructure())
return true;
}
return false;
}
void ColumnVariant::takeDynamicStructureFromSourceColumns(const Columns & source_columns)
{
std::vector<Columns> variants_source_columns;
variants_source_columns.resize(variants.size());
for (size_t i = 0; i != variants.size(); ++i)
variants_source_columns[i].reserve(source_columns.size());
for (const auto & source_column : source_columns)
{
const auto & source_variants = assert_cast<const ColumnVariant &>(*source_column).variants;
for (size_t i = 0; i != source_variants.size(); ++i)
variants_source_columns[i].push_back(source_variants[i]);
}
for (size_t i = 0; i != variants.size(); ++i)
variants[i]->takeDynamicStructureFromSourceColumns(variants_source_columns[i]);
}
}


@ -175,18 +175,32 @@ public:
bool isDefaultAt(size_t n) const override;
bool isNullAt(size_t n) const override;
StringRef getDataAt(size_t n) const override;
void insertData(const char * pos, size_t length) override;
void insert(const Field & x) override;
bool tryInsert(const Field & x) override;
void insertIntoVariant(const Field & x, Discriminator global_discr);
void insertFrom(const IColumn & src_, size_t n) override;
void insertRangeFrom(const IColumn & src, size_t start, size_t length) override;
void insertManyFrom(const IColumn & src, size_t position, size_t length) override;
void insertRangeFrom(const IColumn & src_, size_t start, size_t length) override;
void insertManyFrom(const IColumn & src_, size_t position, size_t length) override;
/// Methods for insertion from another Variant but with known mapping between global discriminators.
void insertFrom(const IColumn & src_, size_t n, const std::vector<ColumnVariant::Discriminator> & global_discriminators_mapping);
void insertRangeFrom(const IColumn & src_, size_t start, size_t length, const std::vector<ColumnVariant::Discriminator> & global_discriminators_mapping);
void insertManyFrom(const IColumn & src_, size_t position, size_t length, const std::vector<ColumnVariant::Discriminator> & global_discriminators_mapping);
/// Methods for insertion into a specific variant.
void insertIntoVariantFrom(Discriminator global_discr, const IColumn & src_, size_t n);
void insertRangeIntoVariantFrom(Discriminator global_discr, const IColumn & src_, size_t start, size_t length);
void insertManyIntoVariantFrom(Discriminator global_discr, const IColumn & src_, size_t position, size_t length);
void insertDefault() override;
void insertManyDefaults(size_t length) override;
void popBack(size_t n) override;
StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const override;
const char * deserializeAndInsertFromArena(const char * pos) override;
const char * deserializeVariantAndInsertFromArena(Discriminator global_discr, const char * pos);
const char * skipSerializedInArena(const char * pos) const override;
void updateHashWithValue(size_t n, SipHash & hash) const override;
void updateWeakHash32(WeakHash32 & hash) const override;
@ -234,6 +248,8 @@ public:
ColumnPtr & getVariantPtrByLocalDiscriminator(size_t discr) { return variants[discr]; }
ColumnPtr & getVariantPtrByGlobalDiscriminator(size_t discr) { return variants[global_to_local_discriminators.at(discr)]; }
const NestedColumns & getVariants() const { return variants; }
const IColumn & getLocalDiscriminatorsColumn() const { return *local_discriminators; }
IColumn & getLocalDiscriminatorsColumn() { return *local_discriminators; }
@ -282,7 +298,19 @@ public:
void applyNullMap(const ColumnVector<UInt8>::Container & null_map);
void applyNegatedNullMap(const ColumnVector<UInt8>::Container & null_map);
/// Extend the current column with new variants. Change global discriminators of the current variants to new ones
/// according to the mapping and add new variants with new global discriminators.
/// This extension doesn't rewrite any data, just adds new empty variants and modifies global/local discriminators matching.
void extend(const std::vector<Discriminator> & old_to_new_global_discriminators, std::vector<std::pair<MutableColumnPtr, Discriminator>> && new_variants_and_discriminators);
bool hasDynamicStructure() const override;
void takeDynamicStructureFromSourceColumns(const Columns & source_columns) override;
private:
void insertFromImpl(const IColumn & src_, size_t n, const std::vector<ColumnVariant::Discriminator> * global_discriminators_mapping);
void insertRangeFromImpl(const IColumn & src_, size_t start, size_t length, const std::vector<ColumnVariant::Discriminator> * global_discriminators_mapping);
void insertManyFromImpl(const IColumn & src_, size_t position, size_t length, const std::vector<ColumnVariant::Discriminator> * global_discriminators_mapping);
void initIdentityGlobalToLocalDiscriminatorsMapping();
template <bool inverted>


@ -16,6 +16,7 @@
#include <Columns/ColumnString.h>
#include <Columns/ColumnTuple.h>
#include <Columns/ColumnVariant.h>
#include <Columns/ColumnDynamic.h>
#include <Columns/ColumnVector.h>
#include <Core/Field.h>
#include <DataTypes/Serializations/SerializationInfo.h>
@ -461,6 +462,7 @@ template class IColumnHelper<ColumnAggregateFunction, IColumn>;
template class IColumnHelper<ColumnFunction, IColumn>;
template class IColumnHelper<ColumnCompressed, IColumn>;
template class IColumnHelper<ColumnVariant, IColumn>;
template class IColumnHelper<ColumnDynamic, IColumn>;
template class IColumnHelper<IColumnDummy, IColumn>;


@ -534,6 +534,11 @@ public:
return res;
}
/// Checks if column has dynamic subcolumns.
virtual bool hasDynamicStructure() const { return false; }
/// For columns with dynamic subcolumns this method takes dynamic structure from source columns
/// and creates proper resulting dynamic structure in advance for merge of these source columns.
virtual void takeDynamicStructureFromSourceColumns(const std::vector<Ptr> & /*source_columns*/) {}
/** Some columns can contain another columns inside.
* So, we have a tree of columns. But not all combinations are possible.


@ -0,0 +1,652 @@
#include <Columns/ColumnDynamic.h>
#include <Columns/ColumnString.h>
#include <Common/Arena.h>
#include <gtest/gtest.h>
using namespace DB;
TEST(ColumnDynamic, CreateEmpty)
{
auto column = ColumnDynamic::create(255);
ASSERT_TRUE(column->empty());
ASSERT_EQ(column->getVariantInfo().variant_type->getName(), "Variant()");
ASSERT_TRUE(column->getVariantInfo().variant_names.empty());
ASSERT_TRUE(column->getVariantInfo().variant_name_to_discriminator.empty());
}
TEST(ColumnDynamic, InsertDefault)
{
auto column = ColumnDynamic::create(255);
column->insertDefault();
ASSERT_TRUE(column->size() == 1);
ASSERT_EQ(column->getVariantInfo().variant_type->getName(), "Variant()");
ASSERT_TRUE(column->getVariantInfo().variant_names.empty());
ASSERT_TRUE(column->getVariantInfo().variant_name_to_discriminator.empty());
ASSERT_TRUE(column->isNullAt(0));
ASSERT_EQ((*column)[0], Field(Null()));
}
TEST(ColumnDynamic, InsertFields)
{
auto column = ColumnDynamic::create(255);
column->insert(Field(42));
column->insert(Field(-42));
column->insert(Field("str1"));
column->insert(Field(Null()));
column->insert(Field(42.42));
column->insert(Field(43));
column->insert(Field(-43));
column->insert(Field("str2"));
column->insert(Field(Null()));
column->insert(Field(43.43));
ASSERT_TRUE(column->size() == 10);
ASSERT_EQ(column->getVariantInfo().variant_type->getName(), "Variant(Float64, Int8, String)");
std::vector<String> expected_names = {"Float64", "Int8", "String"};
ASSERT_EQ(column->getVariantInfo().variant_names, expected_names);
std::unordered_map<String, UInt8> expected_variant_name_to_discriminator = {{"Float64", 0}, {"Int8", 1}, {"String", 2}};
ASSERT_TRUE(column->getVariantInfo().variant_name_to_discriminator == expected_variant_name_to_discriminator);
}
ColumnDynamic::MutablePtr getDynamicWithManyVariants(size_t num_variants, Field tuple_element = Field(42))
{
auto column = ColumnDynamic::create(255);
for (size_t i = 0; i != num_variants; ++i)
{
Tuple tuple;
for (size_t j = 0; j != i + 1; ++j)
tuple.push_back(tuple_element);
column->insert(tuple);
}
return column;
}
TEST(ColumnDynamic, InsertFieldsOverflow1)
{
auto column = getDynamicWithManyVariants(253);
ASSERT_EQ(column->getVariantInfo().variant_names.size(), 253);
column->insert(Field(42.42));
ASSERT_EQ(column->getVariantInfo().variant_names.size(), 254);
ASSERT_TRUE(column->getVariantInfo().variant_name_to_discriminator.contains("Float64"));
column->insert(Field(42));
ASSERT_EQ(column->getVariantInfo().variant_names.size(), 255);
ASSERT_FALSE(column->getVariantInfo().variant_name_to_discriminator.contains("Int8"));
ASSERT_TRUE(column->getVariantInfo().variant_name_to_discriminator.contains("String"));
Field field = (*column)[column->size() - 1];
ASSERT_EQ(field, "42");
column->insert(Field(43));
ASSERT_EQ(column->getVariantInfo().variant_names.size(), 255);
ASSERT_FALSE(column->getVariantInfo().variant_name_to_discriminator.contains("Int8"));
ASSERT_TRUE(column->getVariantInfo().variant_name_to_discriminator.contains("String"));
field = (*column)[column->size() - 1];
ASSERT_EQ(field, "43");
column->insert(Field("str1"));
ASSERT_EQ(column->getVariantInfo().variant_names.size(), 255);
ASSERT_FALSE(column->getVariantInfo().variant_name_to_discriminator.contains("Int8"));
ASSERT_TRUE(column->getVariantInfo().variant_name_to_discriminator.contains("String"));
field = (*column)[column->size() - 1];
ASSERT_EQ(field, "str1");
column->insert(Field(Array({Field(42), Field(43)})));
ASSERT_EQ(column->getVariantInfo().variant_names.size(), 255);
ASSERT_FALSE(column->getVariantInfo().variant_name_to_discriminator.contains("Array(Int8)"));
ASSERT_TRUE(column->getVariantInfo().variant_name_to_discriminator.contains("String"));
field = (*column)[column->size() - 1];
ASSERT_EQ(field, "[42, 43]");
}
TEST(ColumnDynamic, InsertFieldsOverflow2)
{
auto column = getDynamicWithManyVariants(254);
ASSERT_EQ(column->getVariantInfo().variant_names.size(), 254);
column->insert(Field("str1"));
ASSERT_EQ(column->getVariantInfo().variant_names.size(), 255);
ASSERT_TRUE(column->getVariantInfo().variant_name_to_discriminator.contains("String"));
column->insert(Field(42));
ASSERT_EQ(column->getVariantInfo().variant_names.size(), 255);
ASSERT_FALSE(column->getVariantInfo().variant_name_to_discriminator.contains("Int8"));
ASSERT_TRUE(column->getVariantInfo().variant_name_to_discriminator.contains("String"));
Field field = (*column)[column->size() - 1];
ASSERT_EQ(field, "42");
}
ColumnDynamic::MutablePtr getInsertFromColumn(size_t num = 1)
{
auto column_from = ColumnDynamic::create(255);
for (size_t i = 0; i != num; ++i)
{
column_from->insert(Field(42));
column_from->insert(Field(42.42));
column_from->insert(Field("str"));
}
return column_from;
}
void checkInsertFrom(const ColumnDynamic::MutablePtr & column_from, ColumnDynamic::MutablePtr & column_to, const std::string & expected_variant, const std::vector<String> & expected_names, const std::unordered_map<String, UInt8> & expected_variant_name_to_discriminator)
{
column_to->insertFrom(*column_from, 0);
ASSERT_EQ(column_to->getVariantInfo().variant_type->getName(), expected_variant);
ASSERT_EQ(column_to->getVariantInfo().variant_names, expected_names);
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator == expected_variant_name_to_discriminator);
auto field = (*column_to)[column_to->size() - 1];
ASSERT_EQ(field, 42);
column_to->insertFrom(*column_from, 1);
field = (*column_to)[column_to->size() - 1];
ASSERT_EQ(field, 42.42);
column_to->insertFrom(*column_from, 2);
field = (*column_to)[column_to->size() - 1];
ASSERT_EQ(field, "str");
ASSERT_EQ(column_to->getVariantInfo().variant_type->getName(), expected_variant);
ASSERT_EQ(column_to->getVariantInfo().variant_names, expected_names);
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator == expected_variant_name_to_discriminator);
}
TEST(ColumnDynamic, InsertFrom1)
{
auto column_to = ColumnDynamic::create(255);
checkInsertFrom(getInsertFromColumn(), column_to, "Variant(Float64, Int8, String)", {"Float64", "Int8", "String"}, {{"Float64", 0}, {"Int8", 1}, {"String", 2}});
}
TEST(ColumnDynamic, InsertFrom2)
{
auto column_to = ColumnDynamic::create(255);
column_to->insert(Field(42));
column_to->insert(Field(42.42));
column_to->insert(Field("str"));
checkInsertFrom(getInsertFromColumn(), column_to, "Variant(Float64, Int8, String)", {"Float64", "Int8", "String"}, {{"Float64", 0}, {"Int8", 1}, {"String", 2}});
}
TEST(ColumnDynamic, InsertFrom3)
{
auto column_to = ColumnDynamic::create(255);
column_to->insert(Field(42));
column_to->insert(Field(42.42));
column_to->insert(Field("str"));
column_to->insert(Array({Field(42)}));
checkInsertFrom(getInsertFromColumn(), column_to, "Variant(Array(Int8), Float64, Int8, String)", {"Array(Int8)", "Float64", "Int8", "String"}, {{"Array(Int8)", 0}, {"Float64", 1}, {"Int8", 2}, {"String", 3}});
}
TEST(ColumnDynamic, InsertFromOverflow1)
{
auto column_from = ColumnDynamic::create(255);
column_from->insert(Field(42));
column_from->insert(Field(42.42));
column_from->insert(Field("str"));
auto column_to = getDynamicWithManyVariants(253);
column_to->insertFrom(*column_from, 0);
ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 254);
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8"));
auto field = (*column_to)[column_to->size() - 1];
ASSERT_EQ(field, 42);
column_to->insertFrom(*column_from, 1);
ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255);
ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64"));
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String"));
field = (*column_to)[column_to->size() - 1];
ASSERT_EQ(field, "42.42");
column_to->insertFrom(*column_from, 2);
ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255);
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String"));
field = (*column_to)[column_to->size() - 1];
ASSERT_EQ(field, "str");
}
TEST(ColumnDynamic, InsertFromOverflow2)
{
auto column_from = ColumnDynamic::create(255);
column_from->insert(Field(42));
column_from->insert(Field(42.42));
auto column_to = getDynamicWithManyVariants(253);
column_to->insertFrom(*column_from, 0);
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8"));
auto field = (*column_to)[column_to->size() - 1];
ASSERT_EQ(field, 42);
column_to->insertFrom(*column_from, 1);
ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64"));
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String"));
field = (*column_to)[column_to->size() - 1];
ASSERT_EQ(field, "42.42");
}
void checkInsertManyFrom(const ColumnDynamic::MutablePtr & column_from, ColumnDynamic::MutablePtr & column_to, const std::string & expected_variant, const std::vector<String> & expected_names, const std::unordered_map<String, UInt8> & expected_variant_name_to_discriminator)
{
column_to->insertManyFrom(*column_from, 0, 2);
ASSERT_EQ(column_to->getVariantInfo().variant_type->getName(), expected_variant);
ASSERT_EQ(column_to->getVariantInfo().variant_names, expected_names);
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator == expected_variant_name_to_discriminator);
auto field = (*column_to)[column_to->size() - 2];
ASSERT_EQ(field, 42);
field = (*column_to)[column_to->size() - 1];
ASSERT_EQ(field, 42);
column_to->insertManyFrom(*column_from, 1, 2);
field = (*column_to)[column_to->size() - 2];
ASSERT_EQ(field, 42.42);
field = (*column_to)[column_to->size() - 1];
ASSERT_EQ(field, 42.42);
column_to->insertManyFrom(*column_from, 2, 2);
field = (*column_to)[column_to->size() - 2];
ASSERT_EQ(field, "str");
field = (*column_to)[column_to->size() - 1];
ASSERT_EQ(field, "str");
ASSERT_EQ(column_to->getVariantInfo().variant_type->getName(), expected_variant);
ASSERT_EQ(column_to->getVariantInfo().variant_names, expected_names);
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator == expected_variant_name_to_discriminator);
}
TEST(ColumnDynamic, InsertManyFrom1)
{
auto column_to = ColumnDynamic::create(255);
checkInsertManyFrom(getInsertFromColumn(), column_to, "Variant(Float64, Int8, String)", {"Float64", "Int8", "String"}, {{"Float64", 0}, {"Int8", 1}, {"String", 2}});
}
TEST(ColumnDynamic, InsertManyFrom2)
{
auto column_to = ColumnDynamic::create(255);
column_to->insert(Field(42));
column_to->insert(Field(42.42));
column_to->insert(Field("str"));
checkInsertManyFrom(getInsertFromColumn(), column_to, "Variant(Float64, Int8, String)", {"Float64", "Int8", "String"}, {{"Float64", 0}, {"Int8", 1}, {"String", 2}});
}
TEST(ColumnDynamic, InsertManyFrom3)
{
auto column_to = ColumnDynamic::create(255);
column_to->insert(Field(42));
column_to->insert(Field(42.42));
column_to->insert(Field("str"));
column_to->insert(Array({Field(42)}));
checkInsertManyFrom(getInsertFromColumn(), column_to, "Variant(Array(Int8), Float64, Int8, String)", {"Array(Int8)", "Float64", "Int8", "String"}, {{"Array(Int8)", 0}, {"Float64", 1}, {"Int8", 2}, {"String", 3}});
}
TEST(ColumnDynamic, InsertManyFromOverflow1)
{
auto column_from = ColumnDynamic::create(255);
column_from->insert(Field(42));
column_from->insert(Field(42.42));
column_from->insert(Field("str"));
auto column_to = getDynamicWithManyVariants(253);
column_to->insertManyFrom(*column_from, 0, 2);
ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 254);
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8"));
auto field = (*column_to)[column_to->size() - 2];
ASSERT_EQ(field, 42);
field = (*column_to)[column_to->size() - 1];
ASSERT_EQ(field, 42);
column_to->insertManyFrom(*column_from, 1, 2);
ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255);
ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64"));
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String"));
field = (*column_to)[column_to->size() - 2];
ASSERT_EQ(field, "42.42");
field = (*column_to)[column_to->size() - 1];
ASSERT_EQ(field, "42.42");
column_to->insertManyFrom(*column_from, 2, 2);
ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255);
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String"));
field = (*column_to)[column_to->size() - 1];
ASSERT_EQ(field, "str");
field = (*column_to)[column_to->size() - 2];
ASSERT_EQ(field, "str");
}
TEST(ColumnDynamic, InsertManyFromOverflow2)
{
auto column_from = ColumnDynamic::create(255);
column_from->insert(Field(42));
column_from->insert(Field(42.42));
auto column_to = getDynamicWithManyVariants(253);
column_to->insertManyFrom(*column_from, 0, 2);
ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 254);
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8"));
auto field = (*column_to)[column_to->size() - 2];
ASSERT_EQ(field, 42);
field = (*column_to)[column_to->size() - 1];
ASSERT_EQ(field, 42);
column_to->insertManyFrom(*column_from, 1, 2);
ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255);
ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64"));
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String"));
field = (*column_to)[column_to->size() - 2];
ASSERT_EQ(field, "42.42");
field = (*column_to)[column_to->size() - 1];
ASSERT_EQ(field, "42.42");
}
void checkInsertRangeFrom(const ColumnDynamic::MutablePtr & column_from, ColumnDynamic::MutablePtr & column_to, const std::string & expected_variant, const std::vector<String> & expected_names, const std::unordered_map<String, UInt8> & expected_variant_name_to_discriminator)
{
column_to->insertRangeFrom(*column_from, 0, 3);
ASSERT_EQ(column_to->getVariantInfo().variant_type->getName(), expected_variant);
ASSERT_EQ(column_to->getVariantInfo().variant_names, expected_names);
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator == expected_variant_name_to_discriminator);
auto field = (*column_to)[column_to->size() - 3];
ASSERT_EQ(field, 42);
field = (*column_to)[column_to->size() - 2];
ASSERT_EQ(field, 42.42);
field = (*column_to)[column_to->size() - 1];
ASSERT_EQ(field, "str");
column_to->insertRangeFrom(*column_from, 3, 3);
field = (*column_to)[column_to->size() - 3];
ASSERT_EQ(field, 42);
field = (*column_to)[column_to->size() - 2];
ASSERT_EQ(field, 42.42);
field = (*column_to)[column_to->size() - 1];
ASSERT_EQ(field, "str");
ASSERT_EQ(column_to->getVariantInfo().variant_type->getName(), expected_variant);
ASSERT_EQ(column_to->getVariantInfo().variant_names, expected_names);
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator == expected_variant_name_to_discriminator);
}
TEST(ColumnDynamic, InsertRangeFrom1)
{
auto column_to = ColumnDynamic::create(255);
checkInsertRangeFrom(getInsertFromColumn(2), column_to, "Variant(Float64, Int8, String)", {"Float64", "Int8", "String"}, {{"Float64", 0}, {"Int8", 1}, {"String", 2}});
}
TEST(ColumnDynamic, InsertRangeFrom2)
{
auto column_to = ColumnDynamic::create(255);
column_to->insert(Field(42));
column_to->insert(Field(42.42));
column_to->insert(Field("str1"));
checkInsertRangeFrom(getInsertFromColumn(2), column_to, "Variant(Float64, Int8, String)", {"Float64", "Int8", "String"}, {{"Float64", 0}, {"Int8", 1}, {"String", 2}});
}
TEST(ColumnDynamic, InsertRangeFrom3)
{
auto column_to = ColumnDynamic::create(255);
column_to->insert(Field(42));
column_to->insert(Field(42.42));
column_to->insert(Field("str1"));
column_to->insert(Array({Field(42)}));
checkInsertRangeFrom(getInsertFromColumn(2), column_to, "Variant(Array(Int8), Float64, Int8, String)", {"Array(Int8)", "Float64", "Int8", "String"}, {{"Array(Int8)", 0}, {"Float64", 1}, {"Int8", 2}, {"String", 3}});
}
TEST(ColumnDynamic, InsertRangeFromOverflow1)
{
auto column_from = ColumnDynamic::create(255);
column_from->insert(Field(42));
column_from->insert(Field(43));
column_from->insert(Field(42.42));
column_from->insert(Field("str"));
auto column_to = getDynamicWithManyVariants(253);
column_to->insertRangeFrom(*column_from, 0, 4);
ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255);
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8"));
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String"));
ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64"));
auto field = (*column_to)[column_to->size() - 4];
ASSERT_EQ(field, Field(42));
field = (*column_to)[column_to->size() - 3];
ASSERT_EQ(field, Field(43));
field = (*column_to)[column_to->size() - 2];
ASSERT_EQ(field, Field("42.42"));
field = (*column_to)[column_to->size() - 1];
ASSERT_EQ(field, Field("str"));
}
TEST(ColumnDynamic, InsertRangeFromOverflow2)
{
auto column_from = ColumnDynamic::create(255);
column_from->insert(Field(42));
column_from->insert(Field(43));
column_from->insert(Field(42.42));
auto column_to = getDynamicWithManyVariants(253);
column_to->insertRangeFrom(*column_from, 0, 3);
ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255);
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8"));
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String"));
ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64"));
auto field = (*column_to)[column_to->size() - 3];
ASSERT_EQ(field, Field(42));
field = (*column_to)[column_to->size() - 2];
ASSERT_EQ(field, Field(43));
field = (*column_to)[column_to->size() - 1];
ASSERT_EQ(field, Field("42.42"));
}
TEST(ColumnDynamic, InsertRangeFromOverflow3)
{
auto column_from = ColumnDynamic::create(255);
column_from->insert(Field(42));
column_from->insert(Field(43));
column_from->insert(Field(42.42));
auto column_to = getDynamicWithManyVariants(253);
column_to->insert(Field("Str"));
column_to->insertRangeFrom(*column_from, 0, 3);
ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255);
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8"));
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String"));
ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64"));
auto field = (*column_to)[column_to->size() - 3];
ASSERT_EQ(field, Field(42));
field = (*column_to)[column_to->size() - 2];
ASSERT_EQ(field, Field(43));
field = (*column_to)[column_to->size() - 1];
ASSERT_EQ(field, Field("42.42"));
}
TEST(ColumnDynamic, InsertRangeFromOverflow4)
{
auto column_from = ColumnDynamic::create(255);
column_from->insert(Field(42));
column_from->insert(Field(42.42));
column_from->insert(Field("str"));
auto column_to = getDynamicWithManyVariants(254);
column_to->insertRangeFrom(*column_from, 0, 3);
ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255);
ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8"));
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String"));
ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64"));
auto field = (*column_to)[column_to->size() - 3];
ASSERT_EQ(field, Field("42"));
field = (*column_to)[column_to->size() - 2];
ASSERT_EQ(field, Field("42.42"));
field = (*column_to)[column_to->size() - 1];
ASSERT_EQ(field, Field("str"));
}
TEST(ColumnDynamic, InsertRangeFromOverflow5)
{
auto column_from = ColumnDynamic::create(255);
column_from->insert(Field(42));
column_from->insert(Field(43));
column_from->insert(Field(42.42));
column_from->insert(Field("str"));
auto column_to = getDynamicWithManyVariants(253);
column_to->insert(Field("str"));
column_to->insertRangeFrom(*column_from, 0, 4);
ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255);
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8"));
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String"));
ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64"));
auto field = (*column_to)[column_to->size() - 4];
ASSERT_EQ(field, Field(42));
field = (*column_to)[column_to->size() - 3];
ASSERT_EQ(field, Field(43));
field = (*column_to)[column_to->size() - 2];
ASSERT_EQ(field, Field("42.42"));
field = (*column_to)[column_to->size() - 1];
ASSERT_EQ(field, Field("str"));
}
TEST(ColumnDynamic, InsertRangeFromOverflow6)
{
auto column_from = ColumnDynamic::create(255);
column_from->insert(Field(42));
column_from->insert(Field(43));
column_from->insert(Field(44));
column_from->insert(Field(42.42));
column_from->insert(Field(43.43));
column_from->insert(Field("str"));
column_from->insert(Field(Array({Field(42)})));
auto column_to = getDynamicWithManyVariants(253);
column_to->insertRangeFrom(*column_from, 2, 5);
ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255);
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64"));
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String"));
ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8"));
ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Array(Int8)"));
auto field = (*column_to)[column_to->size() - 5];
ASSERT_EQ(field, Field("44"));
field = (*column_to)[column_to->size() - 4];
ASSERT_EQ(field, Field(42.42));
field = (*column_to)[column_to->size() - 3];
ASSERT_EQ(field, Field(43.43));
field = (*column_to)[column_to->size() - 2];
ASSERT_EQ(field, Field("str"));
field = (*column_to)[column_to->size() - 1];
ASSERT_EQ(field, Field("[42]"));
}
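/// The arena round-trip tests below check that serializeValueIntoArena stores enough type
/// information for deserializeAndInsertFromArena to rebuild the variant structure on another
/// column (SerializeDeserializeFromArena2 expects Variant(Float64, Int8, String) on an empty
/// target), and that the overflow case once again falls back to the String variant.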
TEST(ColumnDynamic, SerializeDeserializeFromArena1)
{
auto column = ColumnDynamic::create(255);
column->insert(Field(42));
column->insert(Field(42.42));
column->insert(Field("str"));
column->insert(Field(Null()));
Arena arena;
const char * pos = nullptr;
auto ref1 = column->serializeValueIntoArena(0, arena, pos);
column->serializeValueIntoArena(1, arena, pos);
column->serializeValueIntoArena(2, arena, pos);
column->serializeValueIntoArena(3, arena, pos);
pos = column->deserializeAndInsertFromArena(ref1.data);
pos = column->deserializeAndInsertFromArena(pos);
pos = column->deserializeAndInsertFromArena(pos);
column->deserializeAndInsertFromArena(pos);
ASSERT_EQ((*column)[column->size() - 4], 42);
ASSERT_EQ((*column)[column->size() - 3], 42.42);
ASSERT_EQ((*column)[column->size() - 2], "str");
ASSERT_EQ((*column)[column->size() - 1], Null());
}
TEST(ColumnDynamic, SerializeDeserializeFromArena2)
{
auto column_from = ColumnDynamic::create(255);
column_from->insert(Field(42));
column_from->insert(Field(42.42));
column_from->insert(Field("str"));
column_from->insert(Field(Null()));
Arena arena;
const char * pos = nullptr;
auto ref1 = column_from->serializeValueIntoArena(0, arena, pos);
column_from->serializeValueIntoArena(1, arena, pos);
column_from->serializeValueIntoArena(2, arena, pos);
column_from->serializeValueIntoArena(3, arena, pos);
auto column_to = ColumnDynamic::create(255);
pos = column_to->deserializeAndInsertFromArena(ref1.data);
pos = column_to->deserializeAndInsertFromArena(pos);
pos = column_to->deserializeAndInsertFromArena(pos);
column_to->deserializeAndInsertFromArena(pos);
ASSERT_EQ((*column_from)[column_from->size() - 4], 42);
ASSERT_EQ((*column_from)[column_from->size() - 3], 42.42);
ASSERT_EQ((*column_from)[column_from->size() - 2], "str");
ASSERT_EQ((*column_from)[column_from->size() - 1], Null());
ASSERT_EQ(column_to->getVariantInfo().variant_type->getName(), "Variant(Float64, Int8, String)");
std::vector<String> expected_names = {"Float64", "Int8", "String"};
ASSERT_EQ(column_to->getVariantInfo().variant_names, expected_names);
std::unordered_map<String, UInt8> expected_variant_name_to_discriminator = {{"Float64", 0}, {"Int8", 1}, {"String", 2}};
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator == expected_variant_name_to_discriminator);
}
TEST(ColumnDynamic, SerializeDeserializeFromArenaOverflow)
{
auto column_from = ColumnDynamic::create(255);
column_from->insert(Field(42));
column_from->insert(Field(42.42));
column_from->insert(Field("str"));
column_from->insert(Field(Null()));
Arena arena;
const char * pos = nullptr;
auto ref1 = column_from->serializeValueIntoArena(0, arena, pos);
column_from->serializeValueIntoArena(1, arena, pos);
column_from->serializeValueIntoArena(2, arena, pos);
column_from->serializeValueIntoArena(3, arena, pos);
auto column_to = getDynamicWithManyVariants(253);
pos = column_to->deserializeAndInsertFromArena(ref1.data);
pos = column_to->deserializeAndInsertFromArena(pos);
pos = column_to->deserializeAndInsertFromArena(pos);
column_to->deserializeAndInsertFromArena(pos);
ASSERT_EQ((*column_from)[column_from->size() - 4], 42);
ASSERT_EQ((*column_from)[column_from->size() - 3], 42.42);
ASSERT_EQ((*column_from)[column_from->size() - 2], "str");
ASSERT_EQ((*column_from)[column_from->size() - 1], Null());
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8"));
ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64"));
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String"));
}
TEST(ColumnDynamic, skipSerializedInArena)
{
auto column_from = ColumnDynamic::create(255);
column_from->insert(Field(42));
column_from->insert(Field(42.42));
column_from->insert(Field("str"));
column_from->insert(Field(Null()));
Arena arena;
const char * pos = nullptr;
auto ref1 = column_from->serializeValueIntoArena(0, arena, pos);
column_from->serializeValueIntoArena(1, arena, pos);
column_from->serializeValueIntoArena(2, arena, pos);
auto ref4 = column_from->serializeValueIntoArena(3, arena, pos);
const char * end = ref4.data + ref4.size;
auto column_to = ColumnDynamic::create(255);
pos = column_to->skipSerializedInArena(ref1.data);
pos = column_to->skipSerializedInArena(pos);
pos = column_to->skipSerializedInArena(pos);
pos = column_to->skipSerializedInArena(pos);
ASSERT_EQ(pos, end);
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.empty());
ASSERT_TRUE(column_to->getVariantInfo().variant_names.empty());
}

View File

@ -30,6 +30,7 @@ namespace ErrorCodes
extern const int ASYNC_LOAD_CYCLE;
extern const int ASYNC_LOAD_FAILED;
extern const int ASYNC_LOAD_CANCELED;
extern const int ASYNC_LOAD_WAIT_FAILED;
extern const int LOGICAL_ERROR;
}
@ -433,7 +434,7 @@ void AsyncLoader::wait(const LoadJobPtr & job, bool no_throw)
std::unique_lock job_lock{job->mutex};
wait(job_lock, job);
if (!no_throw && job->load_exception)
std::rethrow_exception(job->load_exception);
throw Exception(ErrorCodes::ASYNC_LOAD_WAIT_FAILED, "Waited job failed: {}", getExceptionMessage(job->load_exception, /* with_stacktrace = */ false));
}
void AsyncLoader::remove(const LoadJobSet & jobs)

View File

@ -227,6 +227,8 @@
M(PartsActive, "Active data part, used by current and upcoming SELECTs.") \
M(AttachedDatabase, "Active database, used by current and upcoming SELECTs.") \
M(AttachedTable, "Active table, used by current and upcoming SELECTs.") \
M(AttachedView, "Active view, used by current and upcoming SELECTs.") \
M(AttachedDictionary, "Active dictionary, used by current and upcoming SELECTs.") \
M(PartsOutdated, "Not active data part, but could be used by only current SELECTs, could be deleted after SELECTs finishes.") \
M(PartsDeleting, "Not active data part with identity refcounter, it is deleting right now by a cleaner.") \
M(PartsDeleteOnDestroy, "Part was moved to another disk and should be deleted in own destructor.") \

View File

@ -600,6 +600,7 @@
M(719, QUERY_CACHE_USED_WITH_SYSTEM_TABLE) \
M(720, USER_EXPIRED) \
M(721, DEPRECATED_FUNCTION) \
M(722, ASYNC_LOAD_WAIT_FAILED) \
\
M(900, DISTRIBUTED_CACHE_ERROR) \
M(901, CANNOT_USE_DISTRIBUTED_CACHE) \

View File

@ -40,6 +40,7 @@ static struct InitFiu
REGULAR(use_delayed_remote_source) \
REGULAR(cluster_discovery_faults) \
REGULAR(replicated_sends_failpoint) \
REGULAR(stripe_log_sink_write_fallpoint) \
ONCE(smt_commit_merge_mutate_zk_fail_after_op) \
ONCE(smt_commit_merge_mutate_zk_fail_before_op) \
ONCE(smt_commit_write_zk_fail_after_op) \
@ -58,6 +59,7 @@ static struct InitFiu
ONCE(execute_query_calling_empty_set_result_func_on_exception) \
ONCE(receive_timeout_on_table_status_response)
namespace FailPoints
{
#define M(NAME) extern const char(NAME)[] = #NAME "";

View File

@ -35,6 +35,7 @@ namespace DB::ErrorCodes
extern const int ASYNC_LOAD_CYCLE;
extern const int ASYNC_LOAD_FAILED;
extern const int ASYNC_LOAD_CANCELED;
extern const int ASYNC_LOAD_WAIT_FAILED;
}
struct Initializer {
@ -262,7 +263,8 @@ TEST(AsyncLoader, CancelPendingJob)
}
catch (Exception & e)
{
ASSERT_EQ(e.code(), ErrorCodes::ASYNC_LOAD_CANCELED);
ASSERT_EQ(e.code(), ErrorCodes::ASYNC_LOAD_WAIT_FAILED);
ASSERT_TRUE(e.message().contains("ASYNC_LOAD_CANCELED"));
}
}
@ -288,7 +290,8 @@ TEST(AsyncLoader, CancelPendingTask)
}
catch (Exception & e)
{
ASSERT_TRUE(e.code() == ErrorCodes::ASYNC_LOAD_CANCELED);
ASSERT_EQ(e.code(), ErrorCodes::ASYNC_LOAD_WAIT_FAILED);
ASSERT_TRUE(e.message().contains("ASYNC_LOAD_CANCELED"));
}
try
@ -298,7 +301,8 @@ TEST(AsyncLoader, CancelPendingTask)
}
catch (Exception & e)
{
ASSERT_TRUE(e.code() == ErrorCodes::ASYNC_LOAD_CANCELED);
ASSERT_EQ(e.code(), ErrorCodes::ASYNC_LOAD_WAIT_FAILED);
ASSERT_TRUE(e.message().contains("ASYNC_LOAD_CANCELED"));
}
}
@ -325,7 +329,8 @@ TEST(AsyncLoader, CancelPendingDependency)
}
catch (Exception & e)
{
ASSERT_TRUE(e.code() == ErrorCodes::ASYNC_LOAD_CANCELED);
ASSERT_EQ(e.code(), ErrorCodes::ASYNC_LOAD_WAIT_FAILED);
ASSERT_TRUE(e.message().contains("ASYNC_LOAD_CANCELED"));
}
try
@ -335,7 +340,8 @@ TEST(AsyncLoader, CancelPendingDependency)
}
catch (Exception & e)
{
ASSERT_TRUE(e.code() == ErrorCodes::ASYNC_LOAD_CANCELED);
ASSERT_EQ(e.code(), ErrorCodes::ASYNC_LOAD_WAIT_FAILED);
ASSERT_TRUE(e.message().contains("ASYNC_LOAD_CANCELED"));
}
}
@ -451,8 +457,9 @@ TEST(AsyncLoader, JobFailure)
}
catch (Exception & e)
{
ASSERT_EQ(e.code(), ErrorCodes::ASYNC_LOAD_FAILED);
ASSERT_TRUE(e.message().find(error_message) != String::npos);
ASSERT_EQ(e.code(), ErrorCodes::ASYNC_LOAD_WAIT_FAILED);
ASSERT_TRUE(e.message().contains(error_message));
ASSERT_TRUE(e.message().contains("ASYNC_LOAD_FAILED"));
}
}
@ -489,8 +496,9 @@ TEST(AsyncLoader, ScheduleJobWithFailedDependencies)
}
catch (Exception & e)
{
ASSERT_EQ(e.code(), ErrorCodes::ASYNC_LOAD_CANCELED);
ASSERT_TRUE(e.message().find(error_message) != String::npos);
ASSERT_EQ(e.code(), ErrorCodes::ASYNC_LOAD_WAIT_FAILED);
ASSERT_TRUE(e.message().contains("ASYNC_LOAD_CANCELED"));
ASSERT_TRUE(e.message().contains(error_message));
}
try
{
@ -499,8 +507,9 @@ TEST(AsyncLoader, ScheduleJobWithFailedDependencies)
}
catch (Exception & e)
{
ASSERT_EQ(e.code(), ErrorCodes::ASYNC_LOAD_CANCELED);
ASSERT_TRUE(e.message().find(error_message) != String::npos);
ASSERT_EQ(e.code(), ErrorCodes::ASYNC_LOAD_WAIT_FAILED);
ASSERT_TRUE(e.message().contains("ASYNC_LOAD_CANCELED"));
ASSERT_TRUE(e.message().contains(error_message));
}
}
@ -531,7 +540,8 @@ TEST(AsyncLoader, ScheduleJobWithCanceledDependencies)
}
catch (Exception & e)
{
ASSERT_EQ(e.code(), ErrorCodes::ASYNC_LOAD_CANCELED);
ASSERT_EQ(e.code(), ErrorCodes::ASYNC_LOAD_WAIT_FAILED);
ASSERT_TRUE(e.message().contains("ASYNC_LOAD_CANCELED"));
}
try
{
@ -540,7 +550,8 @@ TEST(AsyncLoader, ScheduleJobWithCanceledDependencies)
}
catch (Exception & e)
{
ASSERT_EQ(e.code(), ErrorCodes::ASYNC_LOAD_CANCELED);
ASSERT_EQ(e.code(), ErrorCodes::ASYNC_LOAD_WAIT_FAILED);
ASSERT_TRUE(e.message().contains("ASYNC_LOAD_CANCELED"));
}
}

View File

@ -0,0 +1,33 @@
#include <Common/tryGetFileNameByFileDescriptor.h>
#ifdef OS_LINUX
# include <unistd.h>
#elif defined(OS_DARWIN)
# include <fcntl.h>
#endif
#include <fmt/format.h>
namespace DB
{
std::optional<String> tryGetFileNameFromFileDescriptor(int fd)
{
#ifdef OS_LINUX
/// On Linux, resolve the /proc/self/fd/<fd> symlink to the underlying path.
std::string proc_path = fmt::format("/proc/self/fd/{}", fd);
char file_path[PATH_MAX] = {'\0'};
if (readlink(proc_path.c_str(), file_path, sizeof(file_path) - 1) != -1)
return file_path;
return std::nullopt;
#elif defined(OS_DARWIN)
/// On macOS, ask the kernel for the path associated with the descriptor.
char file_path[PATH_MAX] = {'\0'};
if (fcntl(fd, F_GETPATH, file_path) != -1)
return file_path;
return std::nullopt;
#else
(void)fd;
return std::nullopt;
#endif
}
}

View File

@ -0,0 +1,10 @@
#pragma once
#include <optional>
#include <base/types.h>
namespace DB
{
/// Supports only Linux/MacOS. On other platforms, returns nullopt.
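/// A minimal usage sketch (the caller and the output stream are illustrative only):
///     if (std::optional<String> name = tryGetFileNameFromFileDescriptor(fd))
///         std::cerr << "fd " << fd << " refers to " << *name << '\n';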
std::optional<String> tryGetFileNameFromFileDescriptor(int fd);
}

View File

@ -97,6 +97,8 @@ namespace DB
M(UInt64, max_table_size_to_drop, 50000000000lu, "If the size of a table is greater than this value (in bytes), then the table cannot be dropped with any DROP query.", 0) \
M(UInt64, max_partition_size_to_drop, 50000000000lu, "Same as max_table_size_to_drop, but for the partitions.", 0) \
M(UInt64, max_table_num_to_warn, 5000lu, "If the number of tables is greater than this value, the server will create a warning that will be displayed to the user.", 0) \
M(UInt64, max_view_num_to_warn, 10000lu, "If the number of views is greater than this value, the server will create a warning that will be displayed to the user.", 0) \
M(UInt64, max_dictionary_num_to_warn, 1000lu, "If the number of dictionaries is greater than this value, the server will create a warning that will be displayed to the user.", 0) \
M(UInt64, max_database_num_to_warn, 1000lu, "If the number of databases is greater than this value, the server will create a warning that will be displayed to the user.", 0) \
M(UInt64, max_part_num_to_warn, 100000lu, "If the number of parts is greater than this value, the server will create a warning that will be displayed to the user.", 0) \
M(UInt64, concurrent_threads_soft_limit_num, 0, "Sets how many concurrent thread can be allocated before applying CPU pressure. Zero means unlimited.", 0) \

View File

@ -888,6 +888,7 @@ class IColumn;
M(Bool, traverse_shadow_remote_data_paths, false, "Traverse shadow directory when query system.remote_data_paths", 0) \
M(Bool, geo_distance_returns_float64_on_float64_arguments, true, "If all four arguments to `geoDistance`, `greatCircleDistance`, `greatCircleAngle` functions are Float64, return Float64 and use double precision for internal calculations. In previous ClickHouse versions, the functions always returned Float32.", 0) \
M(Bool, allow_get_client_http_header, false, "Allow using the function `getClientHTTPHeader`, which obtains the value of a header of the current HTTP request. It is not enabled by default for security reasons, because some headers, such as `Cookie`, could contain sensitive info. Note that the `X-ClickHouse-*` and `Authentication` headers are always restricted and cannot be obtained with this function.", 0) \
M(Bool, cast_string_to_dynamic_use_inference, false, "Use type inference during String to Dynamic conversion", 0) \
\
/** Experimental functions */ \
M(Bool, allow_experimental_materialized_postgresql_table, false, "Allows to use the MaterializedPostgreSQL table engine. Disabled by default, because this feature is experimental", 0) \
@ -896,6 +897,7 @@ class IColumn;
M(Bool, allow_experimental_hash_functions, false, "Enable experimental hash functions", 0) \
M(Bool, allow_experimental_object_type, false, "Allow Object and JSON data types", 0) \
M(Bool, allow_experimental_variant_type, false, "Allow Variant data type", 0) \
M(Bool, allow_experimental_dynamic_type, false, "Allow Dynamic data type", 0) \
M(Bool, allow_experimental_annoy_index, false, "Allows to use Annoy index. Disabled by default because this feature is experimental", 0) \
M(Bool, allow_experimental_usearch_index, false, "Allows to use USearch index. Disabled by default because this feature is experimental", 0) \
M(UInt64, max_limit_for_ann_queries, 1'000'000, "SELECT queries with LIMIT bigger than this setting cannot use ANN indexes. Helps to prevent memory overflows in ANN search indexes.", 0) \
@ -1086,6 +1088,7 @@ class IColumn;
M(Bool, input_format_csv_skip_trailing_empty_lines, false, "Skip trailing empty lines in CSV format", 0) \
M(Bool, input_format_tsv_skip_trailing_empty_lines, false, "Skip trailing empty lines in TSV format", 0) \
M(Bool, input_format_custom_skip_trailing_empty_lines, false, "Skip trailing empty lines in CustomSeparated format", 0) \
M(Bool, input_format_tsv_crlf_end_of_line, false, "If it is set to true, the file function will read TSV format with \\r\\n instead of \\n.", 0) \
\
M(Bool, input_format_native_allow_types_conversion, true, "Allow data types conversion in Native input format", 0) \
\

View File

@ -87,6 +87,7 @@ static std::map<ClickHouseVersion, SettingsChangesHistory::SettingsChanges> sett
{
{"24.5", {{"allow_deprecated_functions", true, false, "Allow usage of deprecated functions"},
{"allow_experimental_join_condition", false, false, "Support join with inequal conditions which involve columns from both left and right table. e.g. t1.y < t2.y."},
{"input_format_tsv_crlf_end_of_line", false, false, "Enables reading of CRLF line endings with TSV formats"},
{"output_format_parquet_use_custom_encoder", false, true, "Enable custom Parquet encoder."},
{"cross_join_min_rows_to_compress", 0, 10000000, "A new setting."},
{"cross_join_min_bytes_to_compress", 0, 1_GiB, "A new setting."},
@ -100,6 +101,8 @@ static std::map<ClickHouseVersion, SettingsChangesHistory::SettingsChanges> sett
{"azure_ignore_file_doesnt_exist", false, false, "Allow to return 0 rows when the requested files don't exist instead of throwing an exception in AzureBlobStorage table engine"},
{"s3_ignore_file_doesnt_exist", false, false, "Allow to return 0 rows when the requested files don't exist instead of throwing an exception in S3 table engine"},
{"input_format_force_null_for_omitted_fields", false, false, "Disable type-defaults for omitted fields when needed"},
{"cast_string_to_dynamic_use_inference", false, false, "Add setting to allow converting String to Dynamic through parsing"},
{"allow_experimental_dynamic_type", false, false, "Add new experimental Dynamic type"},
{"azure_max_blocks_in_multipart_upload", 50000, 50000, "Maximum number of blocks in multipart upload for Azure."},
}},
{"24.4", {{"input_format_json_throw_on_bad_escape_sequence", true, true, "Allow to save JSON strings with bad escape sequences"},

View File

@ -50,6 +50,7 @@ enum class TypeIndex : uint8_t
IPv6,
JSONPaths,
Variant,
Dynamic
};
/**

View File

@ -75,6 +75,27 @@ void DataTypeArray::forEachChild(const ChildCallback & callback) const
nested->forEachChild(callback);
}
std::unique_ptr<ISerialization::SubstreamData> DataTypeArray::getDynamicSubcolumnData(std::string_view subcolumn_name, const DB::IDataType::SubstreamData & data, bool throw_if_null) const
{
auto nested_type = assert_cast<const DataTypeArray &>(*data.type).nested;
auto nested_data = std::make_unique<ISerialization::SubstreamData>(nested_type->getDefaultSerialization());
nested_data->type = nested_type;
nested_data->column = data.column ? assert_cast<const ColumnArray &>(*data.column).getDataPtr() : nullptr;
auto nested_subcolumn_data = nested_type->getSubcolumnData(subcolumn_name, *nested_data, throw_if_null);
if (!nested_subcolumn_data)
return nullptr;
auto creator = SerializationArray::SubcolumnCreator(data.column ? assert_cast<const ColumnArray &>(*data.column).getOffsetsPtr() : nullptr);
auto res = std::make_unique<ISerialization::SubstreamData>();
res->serialization = creator.create(nested_subcolumn_data->serialization);
res->type = creator.create(nested_subcolumn_data->type);
if (data.column)
res->column = creator.create(nested_subcolumn_data->column);
return res;
}
static DataTypePtr create(const ASTPtr & arguments)
{
if (!arguments || arguments->children.size() != 1)

View File

@ -55,7 +55,12 @@ public:
bool textCanContainOnlyValidUTF8() const override { return nested->textCanContainOnlyValidUTF8(); }
bool isComparable() const override { return nested->isComparable(); }
bool canBeComparedWithCollation() const override { return nested->canBeComparedWithCollation(); }
bool hasDynamicSubcolumns() const override { return nested->hasDynamicSubcolumns(); }
bool hasDynamicSubcolumnsDeprecated() const override { return nested->hasDynamicSubcolumnsDeprecated(); }
/// Array column doesn't have subcolumns by itself but allows reading subcolumns of the nested column.
/// If the nested column has dynamic subcolumns, Array of this type should also be able to read these dynamic subcolumns.
bool hasDynamicSubcolumnsData() const override { return nested->hasDynamicSubcolumnsData(); }
std::unique_ptr<SubstreamData> getDynamicSubcolumnData(std::string_view subcolumn_name, const SubstreamData & data, bool throw_if_null) const override;
bool isValueUnambiguouslyRepresentedInContiguousMemoryRegion() const override
{

View File

@ -0,0 +1,144 @@
#include <DataTypes/DataTypeDynamic.h>
#include <DataTypes/Serializations/SerializationDynamic.h>
#include <DataTypes/Serializations/SerializationDynamicElement.h>
#include <DataTypes/Serializations/SerializationVariantElement.h>
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/NestedUtils.h>
#include <DataTypes/DataTypeNullable.h>
#include <Columns/ColumnDynamic.h>
#include <Columns/ColumnVariant.h>
#include <Core/Field.h>
#include <Parsers/IAST.h>
#include <Parsers/ASTFunction.h>
#include <Parsers/ASTIdentifier.h>
#include <Parsers/ASTLiteral.h>
namespace DB
{
namespace ErrorCodes
{
extern const int ILLEGAL_COLUMN;
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int UNEXPECTED_AST_STRUCTURE;
}
DataTypeDynamic::DataTypeDynamic(size_t max_dynamic_types_) : max_dynamic_types(max_dynamic_types_)
{
}
MutableColumnPtr DataTypeDynamic::createColumn() const
{
return ColumnDynamic::create(max_dynamic_types);
}
String DataTypeDynamic::doGetName() const
{
if (max_dynamic_types == DEFAULT_MAX_DYNAMIC_TYPES)
return "Dynamic";
return "Dynamic(max_types=" + toString(max_dynamic_types) + ")";
}
Field DataTypeDynamic::getDefault() const
{
return Field(Null());
}
SerializationPtr DataTypeDynamic::doGetDefaultSerialization() const
{
return std::make_shared<SerializationDynamic>(max_dynamic_types);
}
static DataTypePtr create(const ASTPtr & arguments)
{
if (!arguments || arguments->children.empty())
return std::make_shared<DataTypeDynamic>();
if (arguments->children.size() > 1)
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
"Dynamic data type can have only one optional argument - the maximum number of dynamic types in a form 'Dynamic(max_types=N)");
const auto * argument = arguments->children[0]->as<ASTFunction>();
if (!argument || argument->name != "equals")
throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, "Dynamic data type argument should be in a form 'max_types=N'");
auto identifier_name = argument->arguments->children[0]->as<ASTIdentifier>()->name();
if (identifier_name != "max_types")
throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, "Unexpected identifier: {}. Dynamic data type argument should be in a form 'max_types=N'", identifier_name);
auto * literal = argument->arguments->children[1]->as<ASTLiteral>();
if (!literal || literal->value.getType() != Field::Types::UInt64 || literal->value.get<UInt64>() == 0 || literal->value.get<UInt64>() > 255)
throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, "'max_types' argument for Dynamic type should be a positive integer between 1 and 255");
return std::make_shared<DataTypeDynamic>(literal->value.get<UInt64>());
}
void registerDataTypeDynamic(DataTypeFactory & factory)
{
factory.registerDataType("Dynamic", create);
}
std::unique_ptr<IDataType::SubstreamData> DataTypeDynamic::getDynamicSubcolumnData(std::string_view subcolumn_name, const DB::IDataType::SubstreamData & data, bool throw_if_null) const
{
auto [subcolumn_type_name, subcolumn_nested_name] = Nested::splitName(subcolumn_name);
/// Check if requested subcolumn is a valid data type.
auto subcolumn_type = DataTypeFactory::instance().tryGet(String(subcolumn_type_name));
if (!subcolumn_type)
{
if (throw_if_null)
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Dynamic type doesn't have subcolumn '{}'", subcolumn_type_name);
return nullptr;
}
std::unique_ptr<SubstreamData> res = std::make_unique<SubstreamData>(subcolumn_type->getDefaultSerialization());
res->type = subcolumn_type;
std::optional<ColumnVariant::Discriminator> discriminator;
if (data.column)
{
/// If column was provided, we should extract subcolumn from Dynamic column.
const auto & dynamic_column = assert_cast<const ColumnDynamic &>(*data.column);
const auto & variant_info = dynamic_column.getVariantInfo();
/// Check if provided Dynamic column has subcolumn of this type.
auto it = variant_info.variant_name_to_discriminator.find(subcolumn_type->getName());
if (it != variant_info.variant_name_to_discriminator.end())
{
discriminator = it->second;
res->column = dynamic_column.getVariantColumn().getVariantPtrByGlobalDiscriminator(*discriminator);
}
}
/// Extract nested subcolumn of requested dynamic subcolumn if needed.
if (!subcolumn_nested_name.empty())
{
res = getSubcolumnData(subcolumn_nested_name, *res, throw_if_null);
if (!res)
return nullptr;
}
res->serialization = std::make_shared<SerializationDynamicElement>(res->serialization, subcolumn_type->getName());
res->type = makeNullableOrLowCardinalityNullableSafe(res->type);
if (data.column)
{
if (discriminator)
{
/// Provided Dynamic column has subcolumn of this type, we should use VariantSubcolumnCreator to
/// create full subcolumn from variant according to discriminators.
const auto & variant_column = assert_cast<const ColumnDynamic &>(*data.column).getVariantColumn();
auto creator = SerializationVariantElement::VariantSubcolumnCreator(variant_column.getLocalDiscriminatorsPtr(), "", *discriminator, variant_column.localDiscriminatorByGlobal(*discriminator));
res->column = creator.create(res->column);
}
else
{
/// Provided Dynamic column doesn't have subcolumn of this type, just create column filled with default values.
auto column = res->type->createColumn();
column->insertManyDefaults(data.column->size());
res->column = std::move(column);
}
}
return res;
}
}

View File

@ -0,0 +1,55 @@
#pragma once
#include <DataTypes/IDataType.h>
namespace DB
{
/// Dynamic type allows storing values of any type inside it and reading
/// subcolumns of any type without knowing all of them in advance.
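/// A hedged SQL-level sketch of what this type enables (table/column names are illustrative;
/// it assumes the allow_experimental_dynamic_type setting introduced alongside this type):
///     CREATE TABLE t (d Dynamic(max_types=4)) ENGINE = Memory;
///     INSERT INTO t VALUES (42), ('Hello'), ([1, 2, 3]);
///     SELECT d, d.String FROM t; -- d.String reads as NULL for rows whose value has another type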
class DataTypeDynamic final : public IDataType
{
public:
static constexpr bool is_parametric = true;
explicit DataTypeDynamic(size_t max_dynamic_types_ = DEFAULT_MAX_DYNAMIC_TYPES);
TypeIndex getTypeId() const override { return TypeIndex::Dynamic; }
const char * getFamilyName() const override { return "Dynamic"; }
bool isParametric() const override { return true; }
bool canBeInsideNullable() const override { return false; }
bool supportsSparseSerialization() const override { return false; }
bool canBeInsideSparseColumns() const override { return false; }
bool isComparable() const override { return true; }
MutableColumnPtr createColumn() const override;
Field getDefault() const override;
/// Two Dynamic types with different max_dynamic_types parameters are considered different.
bool equals(const IDataType & rhs) const override
{
if (const auto * rhs_dynamic_type = typeid_cast<const DataTypeDynamic *>(&rhs))
return max_dynamic_types == rhs_dynamic_type->max_dynamic_types;
return false;
}
bool haveSubtypes() const override { return false; }
bool hasDynamicSubcolumnsData() const override { return true; }
std::unique_ptr<SubstreamData> getDynamicSubcolumnData(std::string_view subcolumn_name, const SubstreamData & data, bool throw_if_null) const override;
size_t getMaxDynamicTypes() const { return max_dynamic_types; }
private:
static constexpr size_t DEFAULT_MAX_DYNAMIC_TYPES = 32;
SerializationPtr doGetDefaultSerialization() const override;
String doGetName() const override;
size_t max_dynamic_types;
};
}

View File

@ -292,6 +292,7 @@ DataTypeFactory::DataTypeFactory()
registerDataTypeMap(*this);
registerDataTypeObject(*this);
registerDataTypeVariant(*this);
registerDataTypeDynamic(*this);
}
DataTypeFactory & DataTypeFactory::instance()

View File

@ -100,5 +100,6 @@ void registerDataTypeDomainSimpleAggregateFunction(DataTypeFactory & factory);
void registerDataTypeDomainGeo(DataTypeFactory & factory);
void registerDataTypeObject(DataTypeFactory & factory);
void registerDataTypeVariant(DataTypeFactory & factory);
void registerDataTypeDynamic(DataTypeFactory & factory);
}

View File

@ -42,7 +42,7 @@ public:
bool isComparable() const override { return key_type->isComparable() && value_type->isComparable(); }
bool isParametric() const override { return true; }
bool haveSubtypes() const override { return true; }
bool hasDynamicSubcolumns() const override { return nested->hasDynamicSubcolumns(); }
bool hasDynamicSubcolumnsDeprecated() const override { return nested->hasDynamicSubcolumnsDeprecated(); }
const DataTypePtr & getKeyType() const { return key_type; }
const DataTypePtr & getValueType() const { return value_type; }

View File

@ -36,7 +36,7 @@ public:
bool haveSubtypes() const override { return false; }
bool equals(const IDataType & rhs) const override;
bool isParametric() const override { return true; }
bool hasDynamicSubcolumns() const override { return true; }
bool hasDynamicSubcolumnsDeprecated() const override { return true; }
SerializationPtr doGetDefaultSerialization() const override;

View File

@ -291,9 +291,9 @@ bool DataTypeTuple::haveMaximumSizeOfValue() const
return std::all_of(elems.begin(), elems.end(), [](auto && elem) { return elem->haveMaximumSizeOfValue(); });
}
bool DataTypeTuple::hasDynamicSubcolumns() const
bool DataTypeTuple::hasDynamicSubcolumnsDeprecated() const
{
return std::any_of(elems.begin(), elems.end(), [](auto && elem) { return elem->hasDynamicSubcolumns(); });
return std::any_of(elems.begin(), elems.end(), [](auto && elem) { return elem->hasDynamicSubcolumnsDeprecated(); });
}
bool DataTypeTuple::isComparable() const

View File

@ -52,7 +52,7 @@ public:
bool isComparable() const override;
bool textCanContainOnlyValidUTF8() const override;
bool haveMaximumSizeOfValue() const override;
bool hasDynamicSubcolumns() const override;
bool hasDynamicSubcolumnsDeprecated() const override;
size_t getMaximumSizeOfValueInMemory() const override;
size_t getSizeOfValueInMemory() const override;

View File

@ -7,7 +7,6 @@
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/FieldToDataType.h>
#include <Common/assert_cast.h>
#include <IO/WriteHelpers.h>
#include <IO/WriteBufferFromString.h>
#include <IO/Operators.h>
#include <Parsers/IAST.h>
@ -18,7 +17,6 @@ namespace DB
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
extern const int EMPTY_DATA_PASSED;
}
@ -33,6 +31,9 @@ DataTypeVariant::DataTypeVariant(const DataTypes & variants_)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Nullable/LowCardinality(Nullable) types are not allowed inside Variant type");
if (type->getTypeId() == TypeIndex::Variant)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Nested Variant types are not allowed");
if (type->getTypeId() == TypeIndex::Dynamic)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Dynamic type is not allowed inside Variant type");
/// Don't use Nothing type as a variant.
if (!isNothing(type))
name_to_type[type->getName()] = type;
@ -42,9 +43,6 @@ DataTypeVariant::DataTypeVariant(const DataTypes & variants_)
for (const auto & [_, type] : name_to_type)
variants.push_back(type);
if (variants.empty())
throw Exception(ErrorCodes::EMPTY_DATA_PASSED, "Variant cannot be empty");
if (variants.size() > ColumnVariant::MAX_NESTED_COLUMNS)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Variant type with more than {} nested types is not allowed", ColumnVariant::MAX_NESTED_COLUMNS);
}
@ -113,9 +111,16 @@ bool DataTypeVariant::equals(const IDataType & rhs) const
return false;
for (size_t i = 0; i < size; ++i)
{
if (!variants[i]->equals(*rhs_variant.variants[i]))
return false;
/// The same data types with different custom names are considered different.
/// For example, UInt8 and Bool.
if ((variants[i]->hasCustomName() || rhs_variant.variants[i]->hasCustomName()) && variants[i]->getName() != rhs_variant.variants[i]->getName())
return false;
}
return true;
}
@ -129,17 +134,15 @@ bool DataTypeVariant::haveMaximumSizeOfValue() const
return std::all_of(variants.begin(), variants.end(), [](auto && elem) { return elem->haveMaximumSizeOfValue(); });
}
bool DataTypeVariant::hasDynamicSubcolumns() const
bool DataTypeVariant::hasDynamicSubcolumnsDeprecated() const
{
return std::any_of(variants.begin(), variants.end(), [](auto && elem) { return elem->hasDynamicSubcolumns(); });
return std::any_of(variants.begin(), variants.end(), [](auto && elem) { return elem->hasDynamicSubcolumnsDeprecated(); });
}
std::optional<ColumnVariant::Discriminator> DataTypeVariant::tryGetVariantDiscriminator(const IDataType & type) const
std::optional<ColumnVariant::Discriminator> DataTypeVariant::tryGetVariantDiscriminator(const String & type_name) const
{
String type_name = type.getName();
for (size_t i = 0; i != variants.size(); ++i)
{
/// We don't use equals here, because it doesn't respect custom type names.
if (variants[i]->getName() == type_name)
return i;
}
@ -183,7 +186,7 @@ void DataTypeVariant::forEachChild(const DB::IDataType::ChildCallback & callback
static DataTypePtr create(const ASTPtr & arguments)
{
if (!arguments || arguments->children.empty())
throw Exception(ErrorCodes::EMPTY_DATA_PASSED, "Variant cannot be empty");
return std::make_shared<DataTypeVariant>(DataTypes{});
DataTypes nested_types;
nested_types.reserve(arguments->children.size());

View File

@ -46,14 +46,14 @@ public:
bool haveSubtypes() const override { return true; }
bool textCanContainOnlyValidUTF8() const override;
bool haveMaximumSizeOfValue() const override;
bool hasDynamicSubcolumns() const override;
bool hasDynamicSubcolumnsDeprecated() const override;
size_t getMaximumSizeOfValueInMemory() const override;
const DataTypePtr & getVariant(size_t i) const { return variants[i]; }
const DataTypes & getVariants() const { return variants; }
/// Check if Variant has provided type in the list of variants and return its discriminator.
std::optional<ColumnVariant::Discriminator> tryGetVariantDiscriminator(const IDataType & type) const;
std::optional<ColumnVariant::Discriminator> tryGetVariantDiscriminator(const String & type_name) const;
void forEachChild(const ChildCallback & callback) const override;

View File

@ -101,14 +101,12 @@ void IDataType::forEachSubcolumn(
data.serialization->enumerateStreams(settings, callback_with_data, data);
}
template <typename Ptr>
Ptr IDataType::getForSubcolumn(
std::unique_ptr<IDataType::SubstreamData> IDataType::getSubcolumnData(
std::string_view subcolumn_name,
const SubstreamData & data,
Ptr SubstreamData::*member,
bool throw_if_null) const
bool throw_if_null)
{
Ptr res;
std::unique_ptr<IDataType::SubstreamData> res;
ISerialization::StreamCallback callback_with_data = [&](const auto & subpath)
{
@ -120,7 +118,29 @@ Ptr IDataType::getForSubcolumn(
auto name = ISerialization::getSubcolumnNameForStream(subpath, prefix_len);
/// Create data from path only if it's requested subcolumn.
if (name == subcolumn_name)
res = ISerialization::createFromPath(subpath, prefix_len).*member;
{
res = std::make_unique<SubstreamData>(ISerialization::createFromPath(subpath, prefix_len));
}
/// Check if this subcolumn is a prefix of the requested subcolumn and whether it can provide dynamic subcolumns.
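/// (Illustration: for Tuple(d Dynamic), the enumerated subcolumn "d" is a prefix of a requested
/// subcolumn such as "d.String", and the Dynamic type can produce the remaining part itself.)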
else if (subcolumn_name.starts_with(name + ".") && subpath[i].data.type && subpath[i].data.type->hasDynamicSubcolumnsData())
{
auto dynamic_subcolumn_name = subcolumn_name.substr(name.size() + 1);
auto dynamic_subcolumn_data = subpath[i].data.type->getDynamicSubcolumnData(dynamic_subcolumn_name, subpath[i].data, false);
if (dynamic_subcolumn_data)
{
/// Create requested subcolumn using dynamic subcolumn data.
auto tmp_subpath = subpath;
if (tmp_subpath[i].creator)
{
dynamic_subcolumn_data->type = tmp_subpath[i].creator->create(dynamic_subcolumn_data->type);
dynamic_subcolumn_data->column = tmp_subpath[i].creator->create(dynamic_subcolumn_data->column);
dynamic_subcolumn_data->serialization = tmp_subpath[i].creator->create(dynamic_subcolumn_data->serialization);
}
tmp_subpath[i].data = *dynamic_subcolumn_data;
res = std::make_unique<SubstreamData>(ISerialization::createFromPath(tmp_subpath, prefix_len));
}
}
}
subpath[i].visited = true;
}
@ -130,8 +150,11 @@ Ptr IDataType::getForSubcolumn(
settings.position_independent_encoding = false;
data.serialization->enumerateStreams(settings, callback_with_data, data);
if (!res && data.type->hasDynamicSubcolumnsData())
return data.type->getDynamicSubcolumnData(subcolumn_name, data, throw_if_null);
if (!res && throw_if_null)
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in type {}", subcolumn_name, getName());
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in type {}", subcolumn_name, data.type->getName());
return res;
}
@ -141,34 +164,51 @@ bool IDataType::hasSubcolumn(std::string_view subcolumn_name) const
return tryGetSubcolumnType(subcolumn_name) != nullptr;
}
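/// Default implementation: the type has dynamic subcolumns if it provides them itself
/// (hasDynamicSubcolumnsData) or if any of its enumerated subcolumns does, e.g. Tuple(d Dynamic).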
bool IDataType::hasDynamicSubcolumns() const
{
if (hasDynamicSubcolumnsData())
return true;
bool has_dynamic_subcolumns = false;
auto data = SubstreamData(getDefaultSerialization()).withType(getPtr());
auto callback = [&](const SubstreamPath &, const String &, const SubstreamData & subcolumn_data)
{
has_dynamic_subcolumns |= subcolumn_data.type->hasDynamicSubcolumnsData();
};
forEachSubcolumn(callback, data);
return has_dynamic_subcolumns;
}
DataTypePtr IDataType::tryGetSubcolumnType(std::string_view subcolumn_name) const
{
auto data = SubstreamData(getDefaultSerialization()).withType(getPtr());
return getForSubcolumn<DataTypePtr>(subcolumn_name, data, &SubstreamData::type, false);
auto subcolumn_data = getSubcolumnData(subcolumn_name, data, false);
return subcolumn_data ? subcolumn_data->type : nullptr;
}
DataTypePtr IDataType::getSubcolumnType(std::string_view subcolumn_name) const
{
auto data = SubstreamData(getDefaultSerialization()).withType(getPtr());
return getForSubcolumn<DataTypePtr>(subcolumn_name, data, &SubstreamData::type, true);
return getSubcolumnData(subcolumn_name, data, true)->type;
}
ColumnPtr IDataType::tryGetSubcolumn(std::string_view subcolumn_name, const ColumnPtr & column) const
{
auto data = SubstreamData(getDefaultSerialization()).withColumn(column);
return getForSubcolumn<ColumnPtr>(subcolumn_name, data, &SubstreamData::column, false);
auto data = SubstreamData(getDefaultSerialization()).withType(getPtr()).withColumn(column);
auto subcolumn_data = getSubcolumnData(subcolumn_name, data, false);
return subcolumn_data ? subcolumn_data->column : nullptr;
}
ColumnPtr IDataType::getSubcolumn(std::string_view subcolumn_name, const ColumnPtr & column) const
{
auto data = SubstreamData(getDefaultSerialization()).withColumn(column);
return getForSubcolumn<ColumnPtr>(subcolumn_name, data, &SubstreamData::column, true);
auto data = SubstreamData(getDefaultSerialization()).withType(getPtr()).withColumn(column);
return getSubcolumnData(subcolumn_name, data, true)->column;
}
SerializationPtr IDataType::getSubcolumnSerialization(std::string_view subcolumn_name, const SerializationPtr & serialization) const
{
auto data = SubstreamData(serialization);
return getForSubcolumn<SerializationPtr>(subcolumn_name, data, &SubstreamData::serialization, true);
auto data = SubstreamData(serialization).withType(getPtr());
return getSubcolumnData(subcolumn_name, data, true)->serialization;
}
Names IDataType::getSubcolumnNames() const
@ -323,6 +363,7 @@ bool isMap(TYPE data_type) {return WhichDataType(data_type).isMap(); } \
bool isInterval(TYPE data_type) {return WhichDataType(data_type).isInterval(); } \
bool isObject(TYPE data_type) { return WhichDataType(data_type).isObject(); } \
bool isVariant(TYPE data_type) { return WhichDataType(data_type).isVariant(); } \
bool isDynamic(TYPE data_type) { return WhichDataType(data_type).isDynamic(); } \
bool isNothing(TYPE data_type) { return WhichDataType(data_type).isNothing(); } \
\
bool isColumnedAsNumber(TYPE data_type) \

View File

@ -11,6 +11,12 @@
namespace DB
{
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
}
class ReadBuffer;
class WriteBuffer;
@ -311,8 +317,13 @@ public:
/// Strings, Numbers, Date, DateTime, Nullable
virtual bool canBeInsideLowCardinality() const { return false; }
/// Object, Array(Object), Tuple(..., Object, ...)
virtual bool hasDynamicSubcolumns() const { return false; }
/// Checks for deprecated Object type usage recursively: Object, Array(Object), Tuple(..., Object, ...)
virtual bool hasDynamicSubcolumnsDeprecated() const { return false; }
/// Checks if the type has dynamic subcolumns.
virtual bool hasDynamicSubcolumns() const;
/// Checks if the type can provide dynamic subcolumn data, i.e. whether getDynamicSubcolumnData can be called.
virtual bool hasDynamicSubcolumnsData() const { return false; }
/// Updates avg_value_size_hint for a newly read column. Used to optimize deserialization. Zero is expected for the first column.
static void updateAvgValueSizeHint(const IColumn & column, double & avg_value_size_hint);
@ -329,16 +340,25 @@ protected:
mutable SerializationPtr custom_serialization;
public:
bool hasCustomName() const { return static_cast<bool>(custom_name.get()); }
const IDataTypeCustomName * getCustomName() const { return custom_name.get(); }
const ISerialization * getCustomSerialization() const { return custom_serialization.get(); }
private:
template <typename Ptr>
Ptr getForSubcolumn(
protected:
static std::unique_ptr<SubstreamData> getSubcolumnData(
std::string_view subcolumn_name,
const SubstreamData & data,
Ptr SubstreamData::*member,
bool throw_if_null) const;
bool throw_if_null);
virtual std::unique_ptr<SubstreamData> getDynamicSubcolumnData(
std::string_view /*subcolumn_name*/,
const SubstreamData & /*data*/,
bool throw_if_null) const
{
if (throw_if_null)
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getDynamicSubcolumnData() is not implemented for type {}", getName());
return nullptr;
}
};
@ -423,6 +443,7 @@ struct WhichDataType
constexpr bool isLowCardinality() const { return idx == TypeIndex::LowCardinality; }
constexpr bool isVariant() const { return idx == TypeIndex::Variant; }
constexpr bool isDynamic() const { return idx == TypeIndex::Dynamic; }
};
/// IDataType helpers (alternative for IDataType virtual methods with single point of truth)
@ -483,6 +504,7 @@ bool isMap(TYPE data_type); \
bool isInterval(TYPE data_type); \
bool isObject(TYPE data_type); \
bool isVariant(TYPE data_type); \
bool isDynamic(TYPE data_type); \
bool isNothing(TYPE data_type); \
\
bool isColumnedAsNumber(TYPE data_type); \

View File

@ -177,7 +177,7 @@ static std::pair<ColumnPtr, DataTypePtr> convertObjectColumnToTuple(
static std::pair<ColumnPtr, DataTypePtr> recursivlyConvertDynamicColumnToTuple(
const ColumnPtr & column, const DataTypePtr & type)
{
if (!type->hasDynamicSubcolumns())
if (!type->hasDynamicSubcolumnsDeprecated())
return {column, type};
if (const auto * type_object = typeid_cast<const DataTypeObject *>(type.get()))
@ -243,7 +243,7 @@ void convertDynamicColumnsToTuples(Block & block, const StorageSnapshotPtr & sto
{
for (auto & column : block)
{
if (!column.type->hasDynamicSubcolumns())
if (!column.type->hasDynamicSubcolumnsDeprecated())
continue;
std::tie(column.column, column.type)
@ -417,7 +417,7 @@ static DataTypePtr getLeastCommonTypeForTuple(
static DataTypePtr getLeastCommonTypeForDynamicColumnsImpl(
const DataTypePtr & type_in_storage, const DataTypes & concrete_types, bool check_ambiguos_paths)
{
if (!type_in_storage->hasDynamicSubcolumns())
if (!type_in_storage->hasDynamicSubcolumnsDeprecated())
return type_in_storage;
if (isObject(type_in_storage))
@ -459,7 +459,7 @@ DataTypePtr getLeastCommonTypeForDynamicColumns(
DataTypePtr createConcreteEmptyDynamicColumn(const DataTypePtr & type_in_storage)
{
if (!type_in_storage->hasDynamicSubcolumns())
if (!type_in_storage->hasDynamicSubcolumnsDeprecated())
return type_in_storage;
if (isObject(type_in_storage))
@ -494,7 +494,7 @@ bool hasDynamicSubcolumns(const ColumnsDescription & columns)
return std::any_of(columns.begin(), columns.end(),
[](const auto & column)
{
return column.type->hasDynamicSubcolumns();
return column.type->hasDynamicSubcolumnsDeprecated();
});
}
@ -1065,7 +1065,7 @@ Field FieldVisitorFoldDimension::operator()(const Null & x) const
void setAllObjectsToDummyTupleType(NamesAndTypesList & columns)
{
for (auto & column : columns)
if (column.type->hasDynamicSubcolumns())
if (column.type->hasDynamicSubcolumnsDeprecated())
column.type = createConcreteEmptyDynamicColumn(column.type);
}

View File

@ -194,7 +194,7 @@ ColumnsDescription getConcreteObjectColumns(
/// dummy column will be removed.
for (const auto & column : storage_columns)
{
if (column.type->hasDynamicSubcolumns())
if (column.type->hasDynamicSubcolumnsDeprecated())
types_in_entries[column.name].push_back(createConcreteEmptyDynamicColumn(column.type));
}
@ -204,7 +204,7 @@ ColumnsDescription getConcreteObjectColumns(
for (const auto & column : entry_columns)
{
auto storage_column = storage_columns.tryGetPhysical(column.name);
if (storage_column && storage_column->type->hasDynamicSubcolumns())
if (storage_column && storage_column->type->hasDynamicSubcolumnsDeprecated())
types_in_entries[column.name].push_back(column.type);
}
}

View File

@ -196,6 +196,8 @@ String getNameForSubstreamPath(
stream_name += ".variant_offsets";
else if (it->type == Substream::VariantElement)
stream_name += "." + it->variant_element_name;
else if (it->type == SubstreamType::DynamicStructure)
stream_name += ".dynamic_structure";
}
return stream_name;
@ -271,6 +273,23 @@ ColumnPtr ISerialization::getFromSubstreamsCache(SubstreamsCache * cache, const
return it == cache->end() ? nullptr : it->second;
}
void ISerialization::addToSubstreamsDeserializeStatesCache(SubstreamsDeserializeStatesCache * cache, const SubstreamPath & path, DeserializeBinaryBulkStatePtr state)
{
if (!cache || path.empty())
return;
cache->emplace(getSubcolumnNameForStream(path), state);
}
ISerialization::DeserializeBinaryBulkStatePtr ISerialization::getFromSubstreamsDeserializeStatesCache(SubstreamsDeserializeStatesCache * cache, const SubstreamPath & path)
{
if (!cache || path.empty())
return nullptr;
auto it = cache->find(getSubcolumnNameForStream(path));
return it == cache->end() ? nullptr : it->second;
}
bool ISerialization::isSpecialCompressionAllowed(const SubstreamPath & path)
{
for (const auto & elem : path)

View File

@ -99,6 +99,19 @@ public:
using SubcolumnCreatorPtr = std::shared_ptr<const ISubcolumnCreator>;
struct SerializeBinaryBulkState
{
virtual ~SerializeBinaryBulkState() = default;
};
struct DeserializeBinaryBulkState
{
virtual ~DeserializeBinaryBulkState() = default;
};
using SerializeBinaryBulkStatePtr = std::shared_ptr<SerializeBinaryBulkState>;
using DeserializeBinaryBulkStatePtr = std::shared_ptr<DeserializeBinaryBulkState>;
struct SubstreamData
{
SubstreamData() = default;
@ -125,10 +138,22 @@ public:
return *this;
}
SubstreamData & withDeserializeState(DeserializeBinaryBulkStatePtr deserialize_state_)
{
deserialize_state = std::move(deserialize_state_);
return *this;
}
SerializationPtr serialization;
DataTypePtr type;
ColumnPtr column;
SerializationInfoPtr serialization_info;
/// For types with dynamic subcolumns, the deserialize state contains information
/// about the current dynamic structure. This information can be useful
/// when enumerateStreams is called after deserializeBinaryBulkStatePrefix
/// to enumerate dynamic streams.
DeserializeBinaryBulkStatePtr deserialize_state;
};
struct Substream
@ -160,6 +185,9 @@ public:
VariantElements,
VariantElement,
DynamicData,
DynamicStructure,
Regular,
};
@ -218,19 +246,6 @@ public:
using OutputStreamGetter = std::function<WriteBuffer*(const SubstreamPath &)>;
using InputStreamGetter = std::function<ReadBuffer*(const SubstreamPath &)>;
struct SerializeBinaryBulkState
{
virtual ~SerializeBinaryBulkState() = default;
};
struct DeserializeBinaryBulkState
{
virtual ~DeserializeBinaryBulkState() = default;
};
using SerializeBinaryBulkStatePtr = std::shared_ptr<SerializeBinaryBulkState>;
using DeserializeBinaryBulkStatePtr = std::shared_ptr<DeserializeBinaryBulkState>;
struct SerializeBinaryBulkSettings
{
OutputStreamGetter getter;
@ -240,6 +255,14 @@ public:
bool low_cardinality_use_single_dictionary_for_part = true;
bool position_independent_encoding = true;
enum class DynamicStatisticsMode
{
NONE, /// Don't write statistics.
PREFIX, /// Write statistics in prefix.
SUFFIX, /// Write statistics in suffix.
};
DynamicStatisticsMode dynamic_write_statistics = DynamicStatisticsMode::NONE;
};
struct DeserializeBinaryBulkSettings
@ -256,6 +279,8 @@ public:
/// If not zero, may be used to avoid reallocations while reading column of String type.
double avg_value_size_hint = 0;
bool dynamic_read_statistics = false;
};
/// Call before serializeBinaryBulkWithMultipleStreams chain to write something before first mark.
@ -270,10 +295,13 @@ public:
SerializeBinaryBulkSettings & /*settings*/,
SerializeBinaryBulkStatePtr & /*state*/) const {}
using SubstreamsDeserializeStatesCache = std::unordered_map<String, DeserializeBinaryBulkStatePtr>;
/// Call before the deserializeBinaryBulkWithMultipleStreams chain to get DeserializeBinaryBulkStatePtr.
virtual void deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & /*settings*/,
DeserializeBinaryBulkStatePtr & /*state*/) const {}
DeserializeBinaryBulkStatePtr & /*state*/,
SubstreamsDeserializeStatesCache * /*cache*/) const {}
/** 'offset' and 'limit' are used to specify range.
* limit = 0 - means no limit.
@ -393,6 +421,9 @@ public:
static void addToSubstreamsCache(SubstreamsCache * cache, const SubstreamPath & path, ColumnPtr column);
static ColumnPtr getFromSubstreamsCache(SubstreamsCache * cache, const SubstreamPath & path);
static void addToSubstreamsDeserializeStatesCache(SubstreamsDeserializeStatesCache * cache, const SubstreamPath & path, DeserializeBinaryBulkStatePtr state);
static DeserializeBinaryBulkStatePtr getFromSubstreamsDeserializeStatesCache(SubstreamsDeserializeStatesCache * cache, const SubstreamPath & path);
static bool isSpecialCompressionAllowed(const SubstreamPath & path);
static size_t getArrayLevel(const SubstreamPath & path);
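
Taken together, the new pieces of this header (the SubstreamsDeserializeStatesCache typedef, the extra argument of deserializeBinaryBulkStatePrefix and the dynamic_read_statistics flag) change the reader-side call sequence roughly as sketched below. This is a sketch, not the actual MergeTree reader code: `stream_for` is a hypothetical helper mapping a substream path to its on-disk ReadBuffer.

```cpp
#include <DataTypes/Serializations/ISerialization.h>
#include <Columns/IColumn.h>
#include <functional>

/// Read one column: deserialize the prefix once (sharing it through the cache with any
/// subcolumn reads of the same column), then read the bulk data.
void readColumnSketch(const DB::SerializationPtr & serialization, DB::ColumnPtr & column,
                      const std::function<DB::ReadBuffer * (const DB::ISerialization::SubstreamPath &)> & stream_for)
{
    DB::ISerialization::DeserializeBinaryBulkSettings settings;
    settings.getter = stream_for;
    settings.dynamic_read_statistics = true;   /// ask Dynamic to read per-variant sizes from its prefix

    DB::ISerialization::SubstreamsDeserializeStatesCache states_cache;
    DB::ISerialization::DeserializeBinaryBulkStatePtr state;

    serialization->deserializeBinaryBulkStatePrefix(settings, state, &states_cache);
    serialization->deserializeBinaryBulkWithMultipleStreams(column, /*limit=*/ 8192, settings, state, /*cache=*/ nullptr);
}
```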

View File

@ -146,10 +146,10 @@ void SerializationAggregateFunction::serializeTextEscaped(const IColumn & column
}
void SerializationAggregateFunction::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
void SerializationAggregateFunction::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
String s;
readEscapedString(s, istr);
settings.tsv.crlf_end_of_line_input ? readEscapedStringCRLF(s, istr) : readEscapedString(s, istr);
deserializeFromString(function, column, s, version);
}
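
The same conditional appears in most of the text serializations touched below (Bool, Enum, FixedString, Object, String, Nullable), so the pattern is worth spelling out once. A hedged sketch, assuming readEscapedStringCRLF is declared in IO/ReadHelpers.h next to readEscapedString, as the call above suggests:

```cpp
#include <IO/ReadHelpers.h>
#include <Formats/FormatSettings.h>
#include <base/types.h>

/// When settings.tsv.crlf_end_of_line_input is enabled, escaped TSV fields are read with
/// the CRLF-aware helper so that '\r' can terminate the value as well, instead of being
/// glued to the end of the field.
void readEscapedTSVField(String & s, DB::ReadBuffer & istr, const DB::FormatSettings & settings)
{
    if (settings.tsv.crlf_end_of_line_input)
        DB::readEscapedStringCRLF(s, istr);   /// treats '\r' as a delimiter too
    else
        DB::readEscapedString(s, istr);       /// stops only at '\t' and '\n'
}
```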

View File

@ -254,7 +254,8 @@ void SerializationArray::enumerateStreams(
auto next_data = SubstreamData(nested)
.withType(type_array ? type_array->getNestedType() : nullptr)
.withColumn(column_array ? column_array->getDataPtr() : nullptr)
.withSerializationInfo(data.serialization_info);
.withSerializationInfo(data.serialization_info)
.withDeserializeState(data.deserialize_state);
nested->enumerateStreams(settings, callback, next_data);
settings.path.pop_back();
@ -284,10 +285,11 @@ void SerializationArray::serializeBinaryBulkStateSuffix(
void SerializationArray::deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const
DeserializeBinaryBulkStatePtr & state,
SubstreamsDeserializeStatesCache * cache) const
{
settings.path.push_back(Substream::ArrayElements);
nested->deserializeBinaryBulkStatePrefix(settings, state);
nested->deserializeBinaryBulkStatePrefix(settings, state, cache);
settings.path.pop_back();
}

View File

@ -55,7 +55,8 @@ public:
void deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const override;
DeserializeBinaryBulkStatePtr & state,
SubstreamsDeserializeStatesCache * cache) const override;
void serializeBinaryBulkWithMultipleStreams(
const IColumn & column,
@ -71,7 +72,6 @@ public:
DeserializeBinaryBulkStatePtr & state,
SubstreamsCache * cache) const override;
private:
struct SubcolumnCreator : public ISubcolumnCreator
{
const ColumnPtr offsets;

View File

@ -242,8 +242,10 @@ void SerializationBool::deserializeTextEscaped(IColumn & column, ReadBuffer & is
{
if (istr.eof())
throw Exception(ErrorCodes::CANNOT_PARSE_BOOL, "Expected boolean value but get EOF.");
deserializeImpl(column, istr, settings, [](ReadBuffer & buf){ return buf.eof() || *buf.position() == '\t' || *buf.position() == '\n'; });
if (settings.tsv.crlf_end_of_line_input)
deserializeImpl(column, istr, settings, [](ReadBuffer & buf){ return buf.eof() || *buf.position() == '\t' || *buf.position() == '\n' || *buf.position() == '\r'; });
else
deserializeImpl(column, istr, settings, [](ReadBuffer & buf){ return buf.eof() || *buf.position() == '\t' || *buf.position() == '\n'; });
}
bool SerializationBool::tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const

View File

@ -75,7 +75,7 @@ void SerializationCustomSimpleText::serializeTextEscaped(const IColumn & column,
void SerializationCustomSimpleText::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
String str;
readEscapedString(str, istr);
settings.tsv.crlf_end_of_line_input ? readEscapedStringCRLF(str, istr) : readEscapedString(str, istr);
deserializeFromString(*this, column, str, settings);
}

View File

@ -0,0 +1,644 @@
#include <DataTypes/Serializations/SerializationDynamic.h>
#include <DataTypes/Serializations/SerializationVariant.h>
#include <DataTypes/FieldToDataType.h>
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/DataTypeVariant.h>
#include <DataTypes/DataTypeString.h>
#include <Columns/ColumnDynamic.h>
#include <Columns/ColumnString.h>
#include <IO/WriteHelpers.h>
#include <IO/ReadHelpers.h>
#include <IO/ReadBufferFromString.h>
#include <Interpreters/castColumn.h>
#include <Formats/EscapingRuleUtils.h>
namespace DB
{
namespace ErrorCodes
{
extern const int INCORRECT_DATA;
extern const int LOGICAL_ERROR;
}
struct SerializeBinaryBulkStateDynamic : public ISerialization::SerializeBinaryBulkState
{
SerializationDynamic::DynamicStructureSerializationVersion structure_version;
DataTypePtr variant_type;
Names variant_names;
SerializationPtr variant_serialization;
ISerialization::SerializeBinaryBulkStatePtr variant_state;
/// Variants statistics. Map (Variant name) -> (Variant size).
ColumnDynamic::Statistics statistics = { .source = ColumnDynamic::Statistics::Source::READ, .data = {} };
explicit SerializeBinaryBulkStateDynamic(UInt64 structure_version_) : structure_version(structure_version_) {}
};
struct DeserializeBinaryBulkStateDynamic : public ISerialization::DeserializeBinaryBulkState
{
SerializationPtr variant_serialization;
ISerialization::DeserializeBinaryBulkStatePtr variant_state;
ISerialization::DeserializeBinaryBulkStatePtr structure_state;
};
void SerializationDynamic::enumerateStreams(
EnumerateStreamsSettings & settings,
const StreamCallback & callback,
const SubstreamData & data) const
{
settings.path.push_back(Substream::DynamicStructure);
callback(settings.path);
settings.path.pop_back();
const auto * column_dynamic = data.column ? &assert_cast<const ColumnDynamic &>(*data.column) : nullptr;
const auto * deserialize_state = data.deserialize_state ? checkAndGetState<DeserializeBinaryBulkStateDynamic>(data.deserialize_state) : nullptr;
/// If column is nullptr and we don't have deserialize state yet, nothing to enumerate as we don't have any variants.
if (!column_dynamic && !deserialize_state)
return;
const auto & variant_type = column_dynamic ? column_dynamic->getVariantInfo().variant_type : checkAndGetState<DeserializeBinaryBulkStateDynamicStructure>(deserialize_state->structure_state)->variant_type;
auto variant_serialization = variant_type->getDefaultSerialization();
settings.path.push_back(Substream::DynamicData);
auto variant_data = SubstreamData(variant_serialization)
.withType(variant_type)
.withColumn(column_dynamic ? column_dynamic->getVariantColumnPtr() : nullptr)
.withSerializationInfo(data.serialization_info)
.withDeserializeState(deserialize_state ? deserialize_state->variant_state : nullptr);
settings.path.back().data = variant_data;
variant_serialization->enumerateStreams(settings, callback, variant_data);
settings.path.pop_back();
}
SerializationDynamic::DynamicStructureSerializationVersion::DynamicStructureSerializationVersion(UInt64 version) : value(static_cast<Value>(version))
{
checkVersion(version);
}
void SerializationDynamic::DynamicStructureSerializationVersion::checkVersion(UInt64 version)
{
if (version != VariantTypeName)
throw Exception(ErrorCodes::INCORRECT_DATA, "Invalid version for Dynamic structure serialization.");
}
void SerializationDynamic::serializeBinaryBulkStatePrefix(
const DB::IColumn & column,
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const
{
const auto & column_dynamic = assert_cast<const ColumnDynamic &>(column);
const auto & variant_info = column_dynamic.getVariantInfo();
settings.path.push_back(Substream::DynamicStructure);
auto * stream = settings.getter(settings.path);
settings.path.pop_back();
if (!stream)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Missing stream for Dynamic column structure during serialization of binary bulk state prefix");
/// Write structure serialization version.
UInt64 structure_version = DynamicStructureSerializationVersion::Value::VariantTypeName;
writeBinaryLittleEndian(structure_version, *stream);
auto dynamic_state = std::make_shared<SerializeBinaryBulkStateDynamic>(structure_version);
dynamic_state->variant_type = variant_info.variant_type;
dynamic_state->variant_names = variant_info.variant_names;
const auto & variant_column = column_dynamic.getVariantColumn();
/// Write internal Variant type name.
writeStringBinary(dynamic_state->variant_type->getName(), *stream);
/// Write statistics in prefix if needed.
if (settings.dynamic_write_statistics == SerializeBinaryBulkSettings::DynamicStatisticsMode::PREFIX)
{
const auto & statistics = column_dynamic.getStatistics();
for (size_t i = 0; i != variant_info.variant_names.size(); ++i)
{
size_t size = 0;
/// Check if we can use statistics stored in the column. There are 2 possible sources
/// of these statistics:
/// - statistics calculated during merge of some data parts (Statistics::Source::MERGE)
/// - statistics read from the data part during deserialization of Dynamic column (Statistics::Source::READ).
/// We can rely only on statistics calculated during the merge, because a column whose statistics were read
/// during deserialization from some data part could have been filtered/limited/transformed/etc, so those statistics can be outdated.
if (!statistics.data.empty() && statistics.source == ColumnDynamic::Statistics::Source::MERGE)
size = statistics.data.at(variant_info.variant_names[i]);
/// Otherwise we can use only variant sizes from current column.
else
size = variant_column.getVariantByGlobalDiscriminator(i).size();
writeVarUInt(size, *stream);
}
}
dynamic_state->variant_serialization = dynamic_state->variant_type->getDefaultSerialization();
settings.path.push_back(Substream::DynamicData);
dynamic_state->variant_serialization->serializeBinaryBulkStatePrefix(variant_column, settings, dynamic_state->variant_state);
settings.path.pop_back();
state = std::move(dynamic_state);
}
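
For reference, a small sketch that reproduces the byte layout the function above writes into the `.dynamic_structure` substream; the Variant type name and the two sizes are example values, and the statistics part is present only in PREFIX mode:

```cpp
#include <IO/WriteBufferFromString.h>
#include <IO/WriteHelpers.h>
#include <IO/VarInt.h>
#include <base/types.h>

String dynamicStructurePrefixExample()
{
    DB::WriteBufferFromOwnString out;
    DB::writeBinaryLittleEndian(UInt64(1), out);            /// structure version (VariantTypeName = 1)
    DB::writeStringBinary("Variant(Int64, String)", out);   /// name of the internal Variant type
    /// Only when dynamic_write_statistics == PREFIX: one VarUInt per variant,
    /// in global discriminator order (the sizes below are made up).
    DB::writeVarUInt(3, out);
    DB::writeVarUInt(5, out);
    return out.str();
}
```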
void SerializationDynamic::deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state,
SubstreamsDeserializeStatesCache * cache) const
{
DeserializeBinaryBulkStatePtr structure_state = deserializeDynamicStructureStatePrefix(settings, cache);
if (!structure_state)
return;
auto dynamic_state = std::make_shared<DeserializeBinaryBulkStateDynamic>();
dynamic_state->structure_state = structure_state;
dynamic_state->variant_serialization = checkAndGetState<DeserializeBinaryBulkStateDynamicStructure>(structure_state)->variant_type->getDefaultSerialization();
settings.path.push_back(Substream::DynamicData);
dynamic_state->variant_serialization->deserializeBinaryBulkStatePrefix(settings, dynamic_state->variant_state, cache);
settings.path.pop_back();
state = std::move(dynamic_state);
}
ISerialization::DeserializeBinaryBulkStatePtr SerializationDynamic::deserializeDynamicStructureStatePrefix(
DeserializeBinaryBulkSettings & settings, SubstreamsDeserializeStatesCache * cache)
{
settings.path.push_back(Substream::DynamicStructure);
DeserializeBinaryBulkStatePtr state = nullptr;
if (auto cached_state = getFromSubstreamsDeserializeStatesCache(cache, settings.path))
{
state = cached_state;
}
else if (auto * structure_stream = settings.getter(settings.path))
{
/// Read structure serialization version.
UInt64 structure_version;
readBinaryLittleEndian(structure_version, *structure_stream);
auto structure_state = std::make_shared<DeserializeBinaryBulkStateDynamicStructure>(structure_version);
/// Read internal Variant type name.
String data_type_name;
readStringBinary(data_type_name, *structure_stream);
structure_state->variant_type = DataTypeFactory::instance().get(data_type_name);
const auto * variant_type = typeid_cast<const DataTypeVariant *>(structure_state->variant_type.get());
if (!variant_type)
throw Exception(ErrorCodes::INCORRECT_DATA, "Incorrect type of Dynamic nested column, expected Variant, got {}", structure_state->variant_type->getName());
/// Read statistics.
if (settings.dynamic_read_statistics)
{
const auto & variants = variant_type->getVariants();
size_t variant_size;
for (const auto & variant : variants)
{
readVarUInt(variant_size, *structure_stream);
structure_state->statistics.data[variant->getName()] = variant_size;
}
}
state = structure_state;
addToSubstreamsDeserializeStatesCache(cache, settings.path, state);
}
settings.path.pop_back();
return state;
}
void SerializationDynamic::serializeBinaryBulkStateSuffix(
SerializeBinaryBulkSettings & settings, SerializeBinaryBulkStatePtr & state) const
{
auto * dynamic_state = checkAndGetState<SerializeBinaryBulkStateDynamic>(state);
settings.path.push_back(Substream::DynamicStructure);
auto * stream = settings.getter(settings.path);
settings.path.pop_back();
if (!stream)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Missing stream for Dynamic column structure during serialization of binary bulk state suffix");
/// Write statistics in suffix if needed.
if (settings.dynamic_write_statistics == SerializeBinaryBulkSettings::DynamicStatisticsMode::SUFFIX)
{
for (const auto & variant_name : dynamic_state->variant_names)
writeVarUInt(dynamic_state->statistics.data[variant_name], *stream);
}
settings.path.push_back(Substream::DynamicData);
dynamic_state->variant_serialization->serializeBinaryBulkStateSuffix(settings, dynamic_state->variant_state);
settings.path.pop_back();
}
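
The writer side chooses where those statistics go through the new dynamic_write_statistics mode. A sketch of the call sequence, assuming `out_stream_for` is a caller-provided mapping from substream path to WriteBuffer (not part of the patch):

```cpp
#include <DataTypes/Serializations/ISerialization.h>
#include <Columns/IColumn.h>
#include <functional>

/// Write one column in SUFFIX mode: the prefix carries only the structure, the per-variant
/// sizes accumulated while writing are emitted by serializeBinaryBulkStateSuffix.
void writeColumnSketch(const DB::SerializationPtr & serialization, const DB::IColumn & column,
                       const std::function<DB::WriteBuffer * (const DB::ISerialization::SubstreamPath &)> & out_stream_for)
{
    DB::ISerialization::SerializeBinaryBulkSettings settings;
    settings.getter = out_stream_for;
    settings.dynamic_write_statistics = DB::ISerialization::SerializeBinaryBulkSettings::DynamicStatisticsMode::SUFFIX;

    DB::ISerialization::SerializeBinaryBulkStatePtr state;
    serialization->serializeBinaryBulkStatePrefix(column, settings, state);
    serialization->serializeBinaryBulkWithMultipleStreams(column, /*offset=*/ 0, /*limit=*/ column.size(), settings, state);
    serialization->serializeBinaryBulkStateSuffix(settings, state);   /// statistics are written here in SUFFIX mode
}
```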
void SerializationDynamic::serializeBinaryBulkWithMultipleStreams(
const DB::IColumn & column,
size_t offset,
size_t limit,
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const
{
const auto & column_dynamic = assert_cast<const ColumnDynamic &>(column);
auto * dynamic_state = checkAndGetState<SerializeBinaryBulkStateDynamic>(state);
const auto & variant_info = column_dynamic.getVariantInfo();
const auto * variant_column = &column_dynamic.getVariantColumn();
if (!variant_info.variant_type->equals(*dynamic_state->variant_type))
throw Exception(ErrorCodes::LOGICAL_ERROR, "Mismatch of internal columns of Dynamic. Expected: {}, Got: {}", dynamic_state->variant_type->getName(), variant_info.variant_type->getName());
settings.path.push_back(Substream::DynamicData);
assert_cast<const SerializationVariant &>(*dynamic_state->variant_serialization)
.serializeBinaryBulkWithMultipleStreamsAndUpdateVariantStatistics(*variant_column, offset, limit, settings, dynamic_state->variant_state, dynamic_state->statistics.data);
settings.path.pop_back();
}
void SerializationDynamic::deserializeBinaryBulkWithMultipleStreams(
DB::ColumnPtr & column,
size_t limit,
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state,
SubstreamsCache * cache) const
{
if (!state)
return;
auto mutable_column = column->assumeMutable();
auto * dynamic_state = checkAndGetState<DeserializeBinaryBulkStateDynamic>(state);
auto * structure_state = checkAndGetState<DeserializeBinaryBulkStateDynamicStructure>(dynamic_state->structure_state);
if (mutable_column->empty())
mutable_column = ColumnDynamic::create(structure_state->variant_type->createColumn(), structure_state->variant_type, max_dynamic_types, structure_state->statistics);
auto & column_dynamic = assert_cast<ColumnDynamic &>(*mutable_column);
const auto & variant_info = column_dynamic.getVariantInfo();
if (!variant_info.variant_type->equals(*structure_state->variant_type))
throw Exception(ErrorCodes::LOGICAL_ERROR, "Mismatch of internal columns of Dynamic. Expected: {}, Got: {}", structure_state->variant_type->getName(), variant_info.variant_type->getName());
settings.path.push_back(Substream::DynamicData);
dynamic_state->variant_serialization->deserializeBinaryBulkWithMultipleStreams(column_dynamic.getVariantColumnPtr(), limit, settings, dynamic_state->variant_state, cache);
settings.path.pop_back();
column = std::move(mutable_column);
}
void SerializationDynamic::serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings & settings) const
{
UInt8 null_bit = field.isNull();
writeBinary(null_bit, ostr);
if (null_bit)
return;
auto field_type = applyVisitor(FieldToDataType(), field);
auto field_type_name = field_type->getName();
writeVarUInt(field_type_name.size(), ostr);
writeString(field_type_name, ostr);
field_type->getDefaultSerialization()->serializeBinary(field, ostr, settings);
}
void SerializationDynamic::deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings & settings) const
{
UInt8 null_bit;
readBinary(null_bit, istr);
if (null_bit)
{
field = Null();
return;
}
size_t field_type_name_size;
readVarUInt(field_type_name_size, istr);
String field_type_name(field_type_name_size, 0);
istr.readStrict(field_type_name.data(), field_type_name_size);
auto field_type = DataTypeFactory::instance().get(field_type_name);
field_type->getDefaultSerialization()->deserializeBinary(field, istr, settings);
}
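
So for a non-NULL row the per-value binary encoding produced by serializeBinary above is: null bit, type name, value. A sketch reproducing it with the same IO helpers (the Int64 value is written as a raw little-endian integer here for illustration; the real code delegates to the type's default serialization):

```cpp
#include <IO/WriteBufferFromString.h>
#include <IO/WriteHelpers.h>
#include <IO/VarInt.h>
#include <base/types.h>

String dynamicRowBinaryValueExample()
{
    DB::WriteBufferFromOwnString out;
    DB::writeBinary(UInt8(0), out);               /// null bit: 0 means a value follows
    DB::writeVarUInt(5, out);                     /// length of the type name...
    DB::writeString("Int64", out);                /// ...and the type name itself
    DB::writeBinaryLittleEndian(Int64(42), out);  /// the value itself
    return out.str();
}
```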
void SerializationDynamic::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
const auto & dynamic_column = assert_cast<const ColumnDynamic &>(column);
const auto & variant_info = dynamic_column.getVariantInfo();
const auto & variant_column = dynamic_column.getVariantColumn();
auto global_discr = variant_column.globalDiscriminatorAt(row_num);
UInt8 null_bit = global_discr == ColumnVariant::NULL_DISCRIMINATOR;
writeBinary(null_bit, ostr);
if (null_bit)
return;
const auto & variant_type = assert_cast<const DataTypeVariant &>(*variant_info.variant_type).getVariant(global_discr);
const auto & variant_type_name = variant_info.variant_names[global_discr];
writeVarUInt(variant_type_name.size(), ostr);
writeString(variant_type_name, ostr);
variant_type->getDefaultSerialization()->serializeBinary(variant_column.getVariantByGlobalDiscriminator(global_discr), variant_column.offsetAt(row_num), ostr, settings);
}
template <typename DeserializeFunc>
static void deserializeVariant(
ColumnVariant & variant_column,
const DataTypePtr & variant_type,
ColumnVariant::Discriminator global_discr,
ReadBuffer & istr,
DeserializeFunc deserialize)
{
auto & variant = variant_column.getVariantByGlobalDiscriminator(global_discr);
deserialize(*variant_type->getDefaultSerialization(), variant, istr);
variant_column.getLocalDiscriminators().push_back(variant_column.localDiscriminatorByGlobal(global_discr));
variant_column.getOffsets().push_back(variant.size() - 1);
}
void SerializationDynamic::deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
auto & dynamic_column = assert_cast<ColumnDynamic &>(column);
UInt8 null_bit;
readBinary(null_bit, istr);
if (null_bit)
{
dynamic_column.insertDefault();
return;
}
size_t variant_type_name_size;
readVarUInt(variant_type_name_size, istr);
String variant_type_name(variant_type_name_size, 0);
istr.readStrict(variant_type_name.data(), variant_type_name_size);
const auto & variant_info = dynamic_column.getVariantInfo();
auto it = variant_info.variant_name_to_discriminator.find(variant_type_name);
if (it != variant_info.variant_name_to_discriminator.end())
{
const auto & variant_type = assert_cast<const DataTypeVariant &>(*variant_info.variant_type).getVariant(it->second);
deserializeVariant(dynamic_column.getVariantColumn(), variant_type, it->second, istr, [&settings](const ISerialization & serialization, IColumn & variant, ReadBuffer & buf){ serialization.deserializeBinary(variant, buf, settings); });
return;
}
/// We don't have this variant yet. Let's try to add it.
auto variant_type = DataTypeFactory::instance().get(variant_type_name);
if (dynamic_column.addNewVariant(variant_type))
{
auto discr = variant_info.variant_name_to_discriminator.at(variant_type_name);
deserializeVariant(dynamic_column.getVariantColumn(), variant_type, discr, istr, [&settings](const ISerialization & serialization, IColumn & variant, ReadBuffer & buf){ serialization.deserializeBinary(variant, buf, settings); });
return;
}
/// We reached the maximum number of variants and couldn't add a new one.
/// This case should be really rare in practice.
/// We should always be able to add a String variant and insert the value as a String.
dynamic_column.addStringVariant();
auto tmp_variant_column = variant_type->createColumn();
variant_type->getDefaultSerialization()->deserializeBinary(*tmp_variant_column, istr, settings);
auto string_column = castColumn(ColumnWithTypeAndName(tmp_variant_column->getPtr(), variant_type, ""), std::make_shared<DataTypeString>());
auto & variant_column = dynamic_column.getVariantColumn();
variant_column.insertIntoVariantFrom(variant_info.variant_name_to_discriminator.at("String"), *string_column, 0);
}
void SerializationDynamic::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
const auto & dynamic_column = assert_cast<const ColumnDynamic &>(column);
dynamic_column.getVariantInfo().variant_type->getDefaultSerialization()->serializeTextCSV(dynamic_column.getVariantColumn(), row_num, ostr, settings);
}
template <typename ReadFieldFunc, typename TryDeserializeVariantFunc, typename DeserializeVariant>
static void deserializeTextImpl(
IColumn & column,
ReadBuffer & istr,
const FormatSettings & settings,
ReadFieldFunc read_field,
FormatSettings::EscapingRule escaping_rule,
TryDeserializeVariantFunc try_deserialize_variant,
DeserializeVariant deserialize_variant)
{
auto & dynamic_column = assert_cast<ColumnDynamic &>(column);
auto & variant_column = dynamic_column.getVariantColumn();
const auto & variant_info = dynamic_column.getVariantInfo();
String field = read_field(istr);
auto field_buf = std::make_unique<ReadBufferFromString>(field);
JSONInferenceInfo json_info;
auto variant_type = tryInferDataTypeByEscapingRule(field, settings, escaping_rule, &json_info);
if (escaping_rule == FormatSettings::EscapingRule::JSON)
transformFinalInferredJSONTypeIfNeeded(variant_type, settings, &json_info);
if (checkIfTypeIsComplete(variant_type) && dynamic_column.addNewVariant(variant_type))
{
auto discr = variant_info.variant_name_to_discriminator.at(variant_type->getName());
deserializeVariant(dynamic_column.getVariantColumn(), variant_type, discr, *field_buf, deserialize_variant);
return;
}
/// We couldn't infer type or add new variant. Try to insert field into current variants.
field_buf = std::make_unique<ReadBufferFromString>(field);
if (try_deserialize_variant(*variant_info.variant_type->getDefaultSerialization(), variant_column, *field_buf))
return;
/// We couldn't insert field into any existing variant, add String variant and read value as String.
dynamic_column.addStringVariant();
if (escaping_rule == FormatSettings::EscapingRule::Quoted && (field.size() < 2 || field.front() != '\'' || field.back() != '\''))
field = "'" + field + "'";
field_buf = std::make_unique<ReadBufferFromString>(field);
auto string_discr = variant_info.variant_name_to_discriminator.at("String");
deserializeVariant(dynamic_column.getVariantColumn(), std::make_shared<DataTypeString>(), string_discr, *field_buf, deserialize_variant);
}
void SerializationDynamic::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
auto read_field = [&settings](ReadBuffer & buf)
{
String field;
readCSVField(field, buf, settings.csv);
return field;
};
auto try_deserialize_variant = [&settings](const ISerialization & serialization, IColumn & col, ReadBuffer & buf)
{
return serialization.tryDeserializeTextCSV(col, buf, settings);
};
auto deserialize_variant = [&settings](const ISerialization & serialization, IColumn & col, ReadBuffer & buf)
{
serialization.deserializeTextCSV(col, buf, settings);
};
deserializeTextImpl(column, istr, settings, read_field, FormatSettings::EscapingRule::CSV, try_deserialize_variant, deserialize_variant);
}
bool SerializationDynamic::tryDeserializeTextCSV(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const
{
deserializeTextCSV(column, istr, settings);
return true;
}
void SerializationDynamic::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
const auto & dynamic_column = assert_cast<const ColumnDynamic &>(column);
dynamic_column.getVariantInfo().variant_type->getDefaultSerialization()->serializeTextEscaped(dynamic_column.getVariantColumn(), row_num, ostr, settings);
}
void SerializationDynamic::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
auto read_field = [](ReadBuffer & buf)
{
String field;
readEscapedString(field, buf);
return field;
};
auto try_deserialize_variant = [&settings](const ISerialization & serialization, IColumn & col, ReadBuffer & buf)
{
return serialization.tryDeserializeTextEscaped(col, buf, settings);
};
auto deserialize_variant = [&settings](const ISerialization & serialization, IColumn & col, ReadBuffer & buf)
{
serialization.deserializeTextEscaped(col, buf, settings);
};
deserializeTextImpl(column, istr, settings, read_field, FormatSettings::EscapingRule::Escaped, try_deserialize_variant, deserialize_variant);
}
bool SerializationDynamic::tryDeserializeTextEscaped(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const
{
deserializeTextEscaped(column, istr, settings);
return true;
}
void SerializationDynamic::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
const auto & dynamic_column = assert_cast<const ColumnDynamic &>(column);
dynamic_column.getVariantInfo().variant_type->getDefaultSerialization()->serializeTextQuoted(dynamic_column.getVariantColumn(), row_num, ostr, settings);
}
void SerializationDynamic::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
auto read_field = [](ReadBuffer & buf)
{
String field;
readQuotedField(field, buf);
return field;
};
auto try_deserialize_variant = [&settings](const ISerialization & serialization, IColumn & col, ReadBuffer & buf)
{
return serialization.tryDeserializeTextQuoted(col, buf, settings);
};
auto deserialize_variant = [&settings](const ISerialization & serialization, IColumn & col, ReadBuffer & buf)
{
serialization.deserializeTextQuoted(col, buf, settings);
};
deserializeTextImpl(column, istr, settings, read_field, FormatSettings::EscapingRule::Quoted, try_deserialize_variant, deserialize_variant);
}
bool SerializationDynamic::tryDeserializeTextQuoted(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const
{
deserializeTextQuoted(column, istr, settings);
return true;
}
void SerializationDynamic::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
const auto & dynamic_column = assert_cast<const ColumnDynamic &>(column);
dynamic_column.getVariantInfo().variant_type->getDefaultSerialization()->serializeTextJSON(dynamic_column.getVariantColumn(), row_num, ostr, settings);
}
void SerializationDynamic::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
auto read_field = [&settings](ReadBuffer & buf)
{
String field;
readJSONField(field, buf, settings.json);
return field;
};
auto try_deserialize_variant = [&settings](const ISerialization & serialization, IColumn & col, ReadBuffer & buf)
{
return serialization.tryDeserializeTextJSON(col, buf, settings);
};
auto deserialize_variant = [&settings](const ISerialization & serialization, IColumn & col, ReadBuffer & buf)
{
serialization.deserializeTextJSON(col, buf, settings);
};
deserializeTextImpl(column, istr, settings, read_field, FormatSettings::EscapingRule::JSON, try_deserialize_variant, deserialize_variant);
}
bool SerializationDynamic::tryDeserializeTextJSON(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const
{
deserializeTextJSON(column, istr, settings);
return true;
}
void SerializationDynamic::serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
const auto & dynamic_column = assert_cast<const ColumnDynamic &>(column);
dynamic_column.getVariantInfo().variant_type->getDefaultSerialization()->serializeTextRaw(dynamic_column.getVariantColumn(), row_num, ostr, settings);
}
void SerializationDynamic::deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
auto read_field = [](ReadBuffer & buf)
{
String field;
readString(field, buf);
return field;
};
auto try_deserialize_variant = [&settings](const ISerialization & serialization, IColumn & col, ReadBuffer & buf)
{
return serialization.tryDeserializeTextRaw(col, buf, settings);
};
auto deserialize_variant = [&settings](const ISerialization & serialization, IColumn & col, ReadBuffer & buf)
{
serialization.deserializeTextRaw(col, buf, settings);
};
deserializeTextImpl(column, istr, settings, read_field, FormatSettings::EscapingRule::Raw, try_deserialize_variant, deserialize_variant);
}
bool SerializationDynamic::tryDeserializeTextRaw(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const
{
deserializeTextRaw(column, istr, settings);
return true;
}
void SerializationDynamic::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
const auto & dynamic_column = assert_cast<const ColumnDynamic &>(column);
dynamic_column.getVariantInfo().variant_type->getDefaultSerialization()->serializeText(dynamic_column.getVariantColumn(), row_num, ostr, settings);
}
void SerializationDynamic::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
auto read_field = [](ReadBuffer & buf)
{
String field;
readStringUntilEOF(field, buf);
return field;
};
auto try_deserialize_variant = [&settings](const ISerialization & serialization, IColumn & col, ReadBuffer & buf)
{
return serialization.tryDeserializeWholeText(col, buf, settings);
};
auto deserialize_variant = [&settings](const ISerialization & serialization, IColumn & col, ReadBuffer & buf)
{
serialization.deserializeWholeText(col, buf, settings);
};
deserializeTextImpl(column, istr, settings, read_field, FormatSettings::EscapingRule::Raw, try_deserialize_variant, deserialize_variant);
}
bool SerializationDynamic::tryDeserializeWholeText(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const
{
deserializeWholeText(column, istr, settings);
return true;
}
void SerializationDynamic::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
const auto & dynamic_column = assert_cast<const ColumnDynamic &>(column);
dynamic_column.getVariantInfo().variant_type->getDefaultSerialization()->serializeTextXML(dynamic_column.getVariantColumn(), row_num, ostr, settings);
}
}

View File

@ -0,0 +1,116 @@
#pragma once
#include <DataTypes/Serializations/ISerialization.h>
#include <Columns/ColumnDynamic.h>
namespace DB
{
class SerializationDynamicElement;
class SerializationDynamic : public ISerialization
{
public:
explicit SerializationDynamic(size_t max_dynamic_types_) : max_dynamic_types(max_dynamic_types_)
{
}
struct DynamicStructureSerializationVersion
{
enum Value
{
VariantTypeName = 1,
};
Value value;
static void checkVersion(UInt64 version);
explicit DynamicStructureSerializationVersion(UInt64 version);
};
void enumerateStreams(
EnumerateStreamsSettings & settings,
const StreamCallback & callback,
const SubstreamData & data) const override;
void serializeBinaryBulkStatePrefix(
const IColumn & column,
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void serializeBinaryBulkStateSuffix(
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state,
SubstreamsDeserializeStatesCache * cache) const override;
static DeserializeBinaryBulkStatePtr deserializeDynamicStructureStatePrefix(
DeserializeBinaryBulkSettings & settings,
SubstreamsDeserializeStatesCache * cache);
void serializeBinaryBulkWithMultipleStreams(
const IColumn & column,
size_t offset,
size_t limit,
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void deserializeBinaryBulkWithMultipleStreams(
ColumnPtr & column,
size_t limit,
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state,
SubstreamsCache * cache) const override;
void serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings & settings) const override;
void deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings & settings) const override;
void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
void deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
bool tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
bool tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
void serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
void deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
bool tryDeserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
private:
friend SerializationDynamicElement;
struct DeserializeBinaryBulkStateDynamicStructure : public ISerialization::DeserializeBinaryBulkState
{
DynamicStructureSerializationVersion structure_version;
DataTypePtr variant_type;
ColumnDynamic::Statistics statistics = {.source = ColumnDynamic::Statistics::Source::READ, .data = {}};
explicit DeserializeBinaryBulkStateDynamicStructure(UInt64 structure_version_) : structure_version(structure_version_) {}
};
size_t max_dynamic_types;
};
}
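
In practice this serialization is obtained from the Dynamic data type rather than constructed by hand. A sketch, assuming the Dynamic type accepts a max_types parameter as the max_dynamic_types member above suggests:

```cpp
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/IDataType.h>

/// The max_types value is an illustrative choice; the default serialization of the type
/// is a SerializationDynamic parameterized with it.
DB::SerializationPtr makeDynamicSerializationExample()
{
    auto dynamic_type = DB::DataTypeFactory::instance().get("Dynamic(max_types=8)");
    return dynamic_type->getDefaultSerialization();
}
```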

View File

@ -0,0 +1,119 @@
#include <DataTypes/Serializations/SerializationDynamicElement.h>
#include <DataTypes/Serializations/SerializationVariantElement.h>
#include <DataTypes/Serializations/SerializationDynamic.h>
#include <DataTypes/DataTypeVariant.h>
#include <DataTypes/DataTypeFactory.h>
#include <Columns/ColumnDynamic.h>
#include <IO/ReadHelpers.h>
namespace DB
{
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
}
struct DeserializeBinaryBulkStateDynamicElement : public ISerialization::DeserializeBinaryBulkState
{
ISerialization::DeserializeBinaryBulkStatePtr structure_state;
SerializationPtr variant_serialization;
ISerialization::DeserializeBinaryBulkStatePtr variant_element_state;
};
void SerializationDynamicElement::enumerateStreams(
DB::ISerialization::EnumerateStreamsSettings & settings,
const DB::ISerialization::StreamCallback & callback,
const DB::ISerialization::SubstreamData & data) const
{
settings.path.push_back(Substream::DynamicStructure);
callback(settings.path);
settings.path.pop_back();
/// If we haven't deserialized the prefix yet, we don't know whether this variant is actually present in the Dynamic column,
/// so we cannot enumerate variant streams.
if (!data.deserialize_state)
return;
auto * deserialize_state = checkAndGetState<DeserializeBinaryBulkStateDynamicElement>(data.deserialize_state);
/// If we don't have this variant, no need to enumerate streams for it as we won't read from any stream.
if (!deserialize_state->variant_serialization)
return;
settings.path.push_back(Substream::DynamicData);
auto variant_data = SubstreamData(deserialize_state->variant_serialization)
.withType(data.type)
.withColumn(data.column)
.withSerializationInfo(data.serialization_info)
.withDeserializeState(deserialize_state->variant_element_state);
deserialize_state->variant_serialization->enumerateStreams(settings, callback, variant_data);
settings.path.pop_back();
}
void SerializationDynamicElement::serializeBinaryBulkStatePrefix(const IColumn &, SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const
{
throw Exception(
ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinaryBulkStatePrefix is not implemented for SerializationDynamicElement");
}
void SerializationDynamicElement::serializeBinaryBulkStateSuffix(SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const
{
throw Exception(
ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinaryBulkStateSuffix is not implemented for SerializationDynamicElement");
}
void SerializationDynamicElement::deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings, DeserializeBinaryBulkStatePtr & state, SubstreamsDeserializeStatesCache * cache) const
{
DeserializeBinaryBulkStatePtr structure_state = SerializationDynamic::deserializeDynamicStructureStatePrefix(settings, cache);
if (!structure_state)
return;
auto dynamic_element_state = std::make_shared<DeserializeBinaryBulkStateDynamicElement>();
dynamic_element_state->structure_state = std::move(structure_state);
const auto & variant_type = checkAndGetState<SerializationDynamic::DeserializeBinaryBulkStateDynamicStructure>(dynamic_element_state->structure_state)->variant_type;
/// Check if we actually have required element in the Variant.
if (auto global_discr = assert_cast<const DataTypeVariant &>(*variant_type).tryGetVariantDiscriminator(dynamic_element_name))
{
settings.path.push_back(Substream::DynamicData);
dynamic_element_state->variant_serialization = std::make_shared<SerializationVariantElement>(nested_serialization, dynamic_element_name, *global_discr);
dynamic_element_state->variant_serialization->deserializeBinaryBulkStatePrefix(settings, dynamic_element_state->variant_element_state, cache);
settings.path.pop_back();
}
state = std::move(dynamic_element_state);
}
void SerializationDynamicElement::serializeBinaryBulkWithMultipleStreams(const IColumn &, size_t, size_t, SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinaryBulkWithMultipleStreams is not implemented for SerializationDynamicElement");
}
void SerializationDynamicElement::deserializeBinaryBulkWithMultipleStreams(
ColumnPtr & result_column,
size_t limit,
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state,
SubstreamsCache * cache) const
{
if (!state)
return;
auto * dynamic_element_state = checkAndGetState<DeserializeBinaryBulkStateDynamicElement>(state);
if (dynamic_element_state->variant_serialization)
{
settings.path.push_back(Substream::DynamicData);
dynamic_element_state->variant_serialization->deserializeBinaryBulkWithMultipleStreams(result_column, limit, settings, dynamic_element_state->variant_element_state, cache);
settings.path.pop_back();
}
else
{
auto mutable_column = result_column->assumeMutable();
mutable_column->insertManyDefaults(limit);
result_column = std::move(mutable_column);
}
}
}
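
Reading a Dynamic subcolumn such as the hypothetical `d.Int64` therefore boils down to a SerializationDynamicElement over the requested type's serialization: the prefix read above resolves the element name to a global discriminator, or leaves the variant part of the state empty so the result column is filled with defaults. A construction sketch (Nullable wrapping and the data-type plumbing are omitted):

```cpp
#include <DataTypes/Serializations/SerializationDynamicElement.h>
#include <DataTypes/DataTypeFactory.h>

/// The second constructor argument is the requested variant's type name, which the
/// deserialized Dynamic structure prefix uses to find the matching global discriminator.
DB::SerializationPtr makeDynamicElementSerializationExample()
{
    auto int64_type = DB::DataTypeFactory::instance().get("Int64");
    return std::make_shared<DB::SerializationDynamicElement>(int64_type->getDefaultSerialization(), "Int64");
}
```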

View File

@ -0,0 +1,58 @@
#pragma once
#include <DataTypes/Serializations/SerializationWrapper.h>
namespace DB
{
/// Serialization for Dynamic element when we read it as a subcolumn.
class SerializationDynamicElement final : public SerializationWrapper
{
private:
/// To be able to deserialize Dynamic element as a subcolumn
/// we need its type name and global discriminator.
String dynamic_element_name;
public:
SerializationDynamicElement(const SerializationPtr & nested_, const String & dynamic_element_name_)
: SerializationWrapper(nested_)
, dynamic_element_name(dynamic_element_name_)
{
}
void enumerateStreams(
EnumerateStreamsSettings & settings,
const StreamCallback & callback,
const SubstreamData & data) const override;
void serializeBinaryBulkStatePrefix(
const IColumn & column,
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void serializeBinaryBulkStateSuffix(
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state,
SubstreamsDeserializeStatesCache * cache) const override;
void serializeBinaryBulkWithMultipleStreams(
const IColumn & column,
size_t offset,
size_t limit,
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void deserializeBinaryBulkWithMultipleStreams(
ColumnPtr & column,
size_t limit,
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state,
SubstreamsCache * cache) const override;
};
}

View File

@ -29,7 +29,7 @@ void SerializationEnum<Type>::deserializeTextEscaped(IColumn & column, ReadBuffe
{
/// NOTE It would be nice to do without creating a temporary object - at least extract std::string out.
std::string field_name;
readEscapedString(field_name, istr);
settings.tsv.crlf_end_of_line_input ? readEscapedStringCRLF(field_name, istr) : readEscapedString(field_name, istr);
assert_cast<ColumnType &>(column).getData().push_back(ref_enum_values.getValue(StringRef(field_name), true));
}
}

View File

@ -10,8 +10,10 @@
#include <IO/WriteHelpers.h>
#include <IO/VarInt.h>
#include "Common/PODArray.h"
#include <Common/typeid_cast.h>
#include <Common/assert_cast.h>
#include "base/types.h"
namespace DB
{
@ -183,14 +185,17 @@ static inline bool tryRead(const SerializationFixedString & self, IColumn & colu
}
void SerializationFixedString::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
void SerializationFixedString::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
read(*this, column, [&istr](ColumnFixedString::Chars & data) { readEscapedStringInto(data, istr); });
read(*this, column, [&istr, &settings](ColumnFixedString::Chars & data)
{
settings.tsv.crlf_end_of_line_input ? readEscapedStringInto<ColumnFixedString::Chars,true>(data, istr) : readEscapedStringInto<ColumnFixedString::Chars,false>(data, istr);
});
}
bool SerializationFixedString::tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
{
return tryRead(*this, column, [&istr](ColumnFixedString::Chars & data) { readEscapedStringInto(data, istr); return true; });
return tryRead(*this, column, [&istr](ColumnFixedString::Chars & data) { readEscapedStringInto<PaddedPODArray<UInt8>,false>(data, istr); return true; });
}

View File

@ -68,9 +68,9 @@ void SerializationInterval::deserializeBinaryBulk(IColumn & column, ReadBuffer &
}
void SerializationInterval::deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings, DeserializeBinaryBulkStatePtr & state) const
DeserializeBinaryBulkSettings & settings, DeserializeBinaryBulkStatePtr & state, SubstreamsDeserializeStatesCache * cache) const
{
dispatch(&ISerialization::deserializeBinaryBulkStatePrefix, FormatSettings::IntervalOutputFormat::Numeric, settings, state);
dispatch(&ISerialization::deserializeBinaryBulkStatePrefix, FormatSettings::IntervalOutputFormat::Numeric, settings, state, cache);
}

View File

@ -34,7 +34,10 @@ public:
void deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings & settings) const override;
void deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const override;
void deserializeBinaryBulkStatePrefix(DeserializeBinaryBulkSettings & settings, DeserializeBinaryBulkStatePtr & state) const override;
void deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state,
SubstreamsDeserializeStatesCache * cache) const override;
void deserializeBinaryBulkWithMultipleStreams(
ColumnPtr & column,
size_t limit,

View File

@ -267,7 +267,8 @@ void SerializationLowCardinality::serializeBinaryBulkStateSuffix(
void SerializationLowCardinality::deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const
DeserializeBinaryBulkStatePtr & state,
SubstreamsDeserializeStatesCache * /*cache*/) const
{
settings.path.push_back(Substream::DictionaryKeys);
auto * stream = settings.getter(settings.path);

View File

@ -33,7 +33,8 @@ public:
void deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const override;
DeserializeBinaryBulkStatePtr & state,
SubstreamsDeserializeStatesCache * cache) const override;
void serializeBinaryBulkWithMultipleStreams(
const IColumn & column,

View File

@ -398,7 +398,8 @@ void SerializationMap::enumerateStreams(
auto next_data = SubstreamData(nested)
.withType(data.type ? assert_cast<const DataTypeMap &>(*data.type).getNestedType() : nullptr)
.withColumn(data.column ? assert_cast<const ColumnMap &>(*data.column).getNestedColumnPtr() : nullptr)
.withSerializationInfo(data.serialization_info);
.withSerializationInfo(data.serialization_info)
.withDeserializeState(data.deserialize_state);
nested->enumerateStreams(settings, callback, next_data);
}
@ -420,9 +421,10 @@ void SerializationMap::serializeBinaryBulkStateSuffix(
void SerializationMap::deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const
DeserializeBinaryBulkStatePtr & state,
SubstreamsDeserializeStatesCache * cache) const
{
nested->deserializeBinaryBulkStatePrefix(settings, state);
nested->deserializeBinaryBulkStatePrefix(settings, state, cache);
}

View File

@ -51,7 +51,8 @@ public:
void deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const override;
DeserializeBinaryBulkStatePtr & state,
SubstreamsDeserializeStatesCache * cache) const override;
void serializeBinaryBulkWithMultipleStreams(
const IColumn & column,

View File

@ -54,10 +54,11 @@ void SerializationNamed::serializeBinaryBulkStateSuffix(
void SerializationNamed::deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const
DeserializeBinaryBulkStatePtr & state,
SubstreamsDeserializeStatesCache * cache) const
{
addToPath(settings.path);
nested_serialization->deserializeBinaryBulkStatePrefix(settings, state);
nested_serialization->deserializeBinaryBulkStatePrefix(settings, state, cache);
settings.path.pop_back();
}

View File

@ -36,7 +36,8 @@ public:
void deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const override;
DeserializeBinaryBulkStatePtr & state,
SubstreamsDeserializeStatesCache * cache) const override;
void serializeBinaryBulkWithMultipleStreams(
const IColumn & column,

View File

@ -95,10 +95,11 @@ void SerializationNullable::serializeBinaryBulkStateSuffix(
void SerializationNullable::deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const
DeserializeBinaryBulkStatePtr & state,
SubstreamsDeserializeStatesCache * cache) const
{
settings.path.push_back(Substream::NullableElements);
nested->deserializeBinaryBulkStatePrefix(settings, state);
nested->deserializeBinaryBulkStatePrefix(settings, state, cache);
settings.path.pop_back();
}
@ -286,7 +287,7 @@ bool SerializationNullable::tryDeserializeNullRaw(DB::ReadBuffer & istr, const D
}
template<typename ReturnType, bool escaped>
ReturnType deserializeTextEscapedAndRawImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested_serialization, bool & is_null)
{
static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
@ -319,10 +320,10 @@ ReturnType deserializeTextEscapedAndRawImpl(IColumn & column, ReadBuffer & istr,
/// Check if we have enough data in buffer to check if it's a null.
if (istr.available() > null_representation.size())
{
auto check_for_null = [&null_representation](ReadBuffer & buf)
auto check_for_null = [&null_representation, &settings](ReadBuffer & buf)
{
auto * pos = buf.position();
if (checkString(null_representation, buf) && (*buf.position() == '\t' || *buf.position() == '\n'))
if (checkString(null_representation, buf) && (*buf.position() == '\t' || *buf.position() == '\n' || (settings.tsv.crlf_end_of_line_input && *buf.position() == '\r')))
return true;
buf.position() = pos;
return false;
@ -334,14 +335,14 @@ ReturnType deserializeTextEscapedAndRawImpl(IColumn & column, ReadBuffer & istr,
/// Use PeekableReadBuffer to make a checkpoint before checking null
/// representation and rollback if check was failed.
PeekableReadBuffer peekable_buf(istr, true);
auto check_for_null = [&null_representation](ReadBuffer & buf_)
auto check_for_null = [&null_representation, &settings](ReadBuffer & buf_)
{
auto & buf = assert_cast<PeekableReadBuffer &>(buf_);
buf.setCheckpoint();
SCOPE_EXIT(buf.dropCheckpoint());
if (checkString(null_representation, buf) && (buf.eof() || *buf.position() == '\t' || *buf.position() == '\n'))
return true;
if (checkString(null_representation, buf) && (buf.eof() || *buf.position() == '\t' || *buf.position() == '\n' || (settings.tsv.crlf_end_of_line_input && *buf.position() == '\r')))
return true;
buf.rollbackToCheckpoint();
return false;
};
@ -371,7 +372,10 @@ ReturnType deserializeTextEscapedAndRawImpl(IColumn & column, ReadBuffer & istr,
if (null_representation.find('\t') != std::string::npos || null_representation.find('\n') != std::string::npos)
throw DB::Exception(ErrorCodes::CANNOT_READ_ALL_DATA, "TSV custom null representation "
"containing '\\t' or '\\n' may not work correctly for large input.");
if (settings.tsv.crlf_end_of_line_input && null_representation.find('\r') != std::string::npos)
throw DB::Exception(ErrorCodes::CANNOT_READ_ALL_DATA, "TSV custom null representation "
"containing '\\r' may not work correctly for large input.");
WriteBufferFromOwnString parsed_value;
if constexpr (escaped)

View File

@ -29,7 +29,8 @@ public:
void deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const override;
DeserializeBinaryBulkStatePtr & state,
SubstreamsDeserializeStatesCache * cache) const override;
void serializeBinaryBulkWithMultipleStreams(
const IColumn & column,

View File

@ -104,9 +104,9 @@ void SerializationObject<Parser>::deserializeWholeText(IColumn & column, ReadBuf
}
template <typename Parser>
void SerializationObject<Parser>::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
void SerializationObject<Parser>::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
deserializeTextImpl(column, [&](String & s) { readEscapedString(s, istr); });
deserializeTextImpl(column, [&](String & s) { settings.tsv.crlf_end_of_line_input ? readEscapedStringCRLF(s, istr) : readEscapedString(s, istr); });
}
template <typename Parser>
@ -210,7 +210,8 @@ void SerializationObject<Parser>::serializeBinaryBulkStateSuffix(
template <typename Parser>
void SerializationObject<Parser>::deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const
DeserializeBinaryBulkStatePtr & state,
SubstreamsDeserializeStatesCache * cache) const
{
checkSerializationIsSupported(settings);
if (state)
@ -258,7 +259,7 @@ void SerializationObject<Parser>::deserializeBinaryBulkStatePrefix(
}
settings.path.push_back(Substream::ObjectData);
state_object->nested_serialization->deserializeBinaryBulkStatePrefix(settings, state_object->nested_state);
state_object->nested_serialization->deserializeBinaryBulkStatePrefix(settings, state_object->nested_state, cache);
settings.path.pop_back();
state = std::move(state_object);

View File

@ -41,7 +41,8 @@ public:
void deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const override;
DeserializeBinaryBulkStatePtr & state,
SubstreamsDeserializeStatesCache * cache) const override;
void serializeBinaryBulkWithMultipleStreams(
const IColumn & column,

View File

@ -152,7 +152,7 @@ void SerializationSparse::enumerateStreams(
const StreamCallback & callback,
const SubstreamData & data) const
{
const auto * column_sparse = data.column ? &assert_cast<const ColumnSparse &>(*data.column) : nullptr;
const auto * column_sparse = data.column ? typeid_cast<const ColumnSparse *>(data.column.get()) : nullptr;
size_t column_size = column_sparse ? column_sparse->size() : 0;
settings.path.push_back(Substream::SparseOffsets);
@ -170,7 +170,7 @@ void SerializationSparse::enumerateStreams(
auto next_data = SubstreamData(nested)
.withType(data.type)
.withColumn(column_sparse ? column_sparse->getValuesPtr() : nullptr)
.withColumn(column_sparse ? column_sparse->getValuesPtr() : data.column)
.withSerializationInfo(data.serialization_info);
nested->enumerateStreams(settings, callback, next_data);
@ -242,12 +242,13 @@ void SerializationSparse::serializeBinaryBulkStateSuffix(
void SerializationSparse::deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const
DeserializeBinaryBulkStatePtr & state,
SubstreamsDeserializeStatesCache * cache) const
{
auto state_sparse = std::make_shared<DeserializeStateSparse>();
settings.path.push_back(Substream::SparseElements);
nested->deserializeBinaryBulkStatePrefix(settings, state_sparse->nested);
nested->deserializeBinaryBulkStatePrefix(settings, state_sparse->nested, cache);
settings.path.pop_back();
state = std::move(state_sparse);

View File

@ -43,7 +43,8 @@ public:
void deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const override;
DeserializeBinaryBulkStatePtr & state,
SubstreamsDeserializeStatesCache * cache) const override;
/// Allows to write ColumnSparse and other columns in sparse serialization.
void serializeBinaryBulkWithMultipleStreams(

View File

@ -147,7 +147,6 @@ void SerializationString::serializeBinaryBulk(const IColumn & column, WriteBuffe
}
}
template <int UNROLL_TIMES>
static NO_INLINE void deserializeBinarySSE2(ColumnString::Chars & data, ColumnString::Offsets & offsets, ReadBuffer & istr, size_t limit)
{
@ -324,14 +323,17 @@ bool SerializationString::tryDeserializeWholeText(IColumn & column, ReadBuffer &
return read<bool>(column, [&](ColumnString::Chars & data) { readStringUntilEOFInto(data, istr); return true; });
}
void SerializationString::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
void SerializationString::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
read<void>(column, [&](ColumnString::Chars & data) { readEscapedStringInto(data, istr); });
read<void>(column, [&](ColumnString::Chars & data)
{
settings.tsv.crlf_end_of_line_input ? readEscapedStringInto<PaddedPODArray<UInt8>,true>(data, istr) : readEscapedStringInto<PaddedPODArray<UInt8>,false>(data, istr);
});
}
bool SerializationString::tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
{
return read<bool>(column, [&](ColumnString::Chars & data) { readEscapedStringInto(data, istr); return true; });
return read<bool>(column, [&](ColumnString::Chars & data) { readEscapedStringInto<PaddedPODArray<UInt8>,true>(data, istr); return true; });
}
void SerializationString::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
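The TSV-related changes above switch, under settings.tsv.crlf_end_of_line_input, to readers that also recognize \r\n as a row terminator. A toy model of that toggle follows (an illustration of the setting's intent only, not the actual readEscapedStringInto implementation):

#include <string>
#include <string_view>

// Toy escaped TSV field reader: stop at '\t' or '\n', and, when support_crlf is
// enabled, also treat "\r\n" as a row terminator. The real reader additionally
// handles backslash escapes and writes into a PODArray.
template <bool support_crlf>
size_t readEscapedField(std::string_view in, std::string & out)
{
    size_t i = 0;
    for (; i < in.size(); ++i)
    {
        char c = in[i];
        if (c == '\t' || c == '\n')
            break;
        if constexpr (support_crlf)
            if (c == '\r' && i + 1 < in.size() && in[i + 1] == '\n')
                break;
        out.push_back(c);  // escape sequences omitted in this sketch
    }
    return i;              // number of characters consumed
}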

View File

@ -549,26 +549,6 @@ bool SerializationTuple::tryDeserializeTextCSV(IColumn & column, ReadBuffer & is
return tryDeserializeText(column, rb, settings, true);
}
void SerializationTuple::enumerateStreams(
EnumerateStreamsSettings & settings,
const StreamCallback & callback,
const SubstreamData & data) const
{
const auto * type_tuple = data.type ? &assert_cast<const DataTypeTuple &>(*data.type) : nullptr;
const auto * column_tuple = data.column ? &assert_cast<const ColumnTuple &>(*data.column) : nullptr;
const auto * info_tuple = data.serialization_info ? &assert_cast<const SerializationInfoTuple &>(*data.serialization_info) : nullptr;
for (size_t i = 0; i < elems.size(); ++i)
{
auto next_data = SubstreamData(elems[i])
.withType(type_tuple ? type_tuple->getElement(i) : nullptr)
.withColumn(column_tuple ? column_tuple->getColumnPtr(i) : nullptr)
.withSerializationInfo(info_tuple ? info_tuple->getElementInfo(i) : nullptr);
elems[i]->enumerateStreams(settings, callback, next_data);
}
}
struct SerializeBinaryBulkStateTuple : public ISerialization::SerializeBinaryBulkState
{
std::vector<ISerialization::SerializeBinaryBulkStatePtr> states;
@ -579,6 +559,27 @@ struct DeserializeBinaryBulkStateTuple : public ISerialization::DeserializeBinar
std::vector<ISerialization::DeserializeBinaryBulkStatePtr> states;
};
void SerializationTuple::enumerateStreams(
EnumerateStreamsSettings & settings,
const StreamCallback & callback,
const SubstreamData & data) const
{
const auto * type_tuple = data.type ? &assert_cast<const DataTypeTuple &>(*data.type) : nullptr;
const auto * column_tuple = data.column ? &assert_cast<const ColumnTuple &>(*data.column) : nullptr;
const auto * info_tuple = data.serialization_info ? &assert_cast<const SerializationInfoTuple &>(*data.serialization_info) : nullptr;
const auto * tuple_deserialize_state = data.deserialize_state ? checkAndGetState<DeserializeBinaryBulkStateTuple>(data.deserialize_state) : nullptr;
for (size_t i = 0; i < elems.size(); ++i)
{
auto next_data = SubstreamData(elems[i])
.withType(type_tuple ? type_tuple->getElement(i) : nullptr)
.withColumn(column_tuple ? column_tuple->getColumnPtr(i) : nullptr)
.withSerializationInfo(info_tuple ? info_tuple->getElementInfo(i) : nullptr)
.withDeserializeState(tuple_deserialize_state ? tuple_deserialize_state->states[i] : nullptr);
elems[i]->enumerateStreams(settings, callback, next_data);
}
}
void SerializationTuple::serializeBinaryBulkStatePrefix(
const IColumn & column,
@ -606,13 +607,14 @@ void SerializationTuple::serializeBinaryBulkStateSuffix(
void SerializationTuple::deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const
DeserializeBinaryBulkStatePtr & state,
SubstreamsDeserializeStatesCache * cache) const
{
auto tuple_state = std::make_shared<DeserializeBinaryBulkStateTuple>();
tuple_state->states.resize(elems.size());
for (size_t i = 0; i < elems.size(); ++i)
elems[i]->deserializeBinaryBulkStatePrefix(settings, tuple_state->states[i]);
elems[i]->deserializeBinaryBulkStatePrefix(settings, tuple_state->states[i], cache);
state = std::move(tuple_state);
}
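Moving the bulk-state structs above enumerateStreams lets the method downcast data.deserialize_state with checkAndGetState and hand each tuple element its own nested state via withDeserializeState. A simplified stand-in for that typed lookup (checkAndGetStateSketch mirrors, but is not, the real helper):

#include <memory>
#include <stdexcept>
#include <vector>

struct DeserializeState { virtual ~DeserializeState() = default; };
using StatePtr = std::shared_ptr<DeserializeState>;

struct TupleState : DeserializeState { std::vector<StatePtr> element_states; };

// Downcast a generic state to the concrete type a serialization expects;
// a mismatch means the wrong state was threaded through and is treated as a bug.
template <typename Concrete>
const Concrete * checkAndGetStateSketch(const StatePtr & state)
{
    const auto * concrete = dynamic_cast<const Concrete *>(state.get());
    if (!concrete)
        throw std::logic_error("unexpected deserialize state type");
    return concrete;
}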

View File

@ -53,7 +53,8 @@ public:
void deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const override;
DeserializeBinaryBulkStatePtr & state,
SubstreamsDeserializeStatesCache * cache) const override;
void serializeBinaryBulkWithMultipleStreams(
const IColumn & column,

View File

@ -28,6 +28,16 @@ namespace ErrorCodes
extern const int INCORRECT_DATA;
}
struct SerializeBinaryBulkStateVariant : public ISerialization::SerializeBinaryBulkState
{
std::vector<ISerialization::SerializeBinaryBulkStatePtr> states;
};
struct DeserializeBinaryBulkStateVariant : public ISerialization::DeserializeBinaryBulkState
{
std::vector<ISerialization::DeserializeBinaryBulkStatePtr> states;
};
void SerializationVariant::enumerateStreams(
EnumerateStreamsSettings & settings,
const StreamCallback & callback,
@ -35,6 +45,7 @@ void SerializationVariant::enumerateStreams(
{
const auto * type_variant = data.type ? &assert_cast<const DataTypeVariant &>(*data.type) : nullptr;
const auto * column_variant = data.column ? &assert_cast<const ColumnVariant &>(*data.column) : nullptr;
const auto * variant_deserialize_state = data.deserialize_state ? checkAndGetState<DeserializeBinaryBulkStateVariant>(data.deserialize_state) : nullptr;
auto discriminators_serialization = std::make_shared<SerializationNamed>(std::make_shared<SerializationNumber<ColumnVariant::Discriminator>>(), "discr", SubstreamType::NamedVariantDiscriminators);
auto local_discriminators = column_variant ? column_variant->getLocalDiscriminatorsPtr() : nullptr;
@ -59,7 +70,8 @@ void SerializationVariant::enumerateStreams(
auto variant_data = SubstreamData(variants[i])
.withType(type_variant ? type_variant->getVariant(i) : nullptr)
.withColumn(column_variant ? column_variant->getVariantPtrByGlobalDiscriminator(i) : nullptr)
.withSerializationInfo(data.serialization_info);
.withSerializationInfo(data.serialization_info)
.withDeserializeState(variant_deserialize_state ? variant_deserialize_state->states[i] : nullptr);
addVariantElementToPath(settings.path, i);
settings.path.back().data = variant_data;
@ -70,16 +82,6 @@ void SerializationVariant::enumerateStreams(
settings.path.pop_back();
}
struct SerializeBinaryBulkStateVariant : public ISerialization::SerializeBinaryBulkState
{
std::vector<ISerialization::SerializeBinaryBulkStatePtr> states;
};
struct DeserializeBinaryBulkStateVariant : public ISerialization::DeserializeBinaryBulkState
{
std::vector<ISerialization::DeserializeBinaryBulkStatePtr> states;
};
void SerializationVariant::serializeBinaryBulkStatePrefix(
const IColumn & column,
SerializeBinaryBulkSettings & settings,
@ -123,7 +125,8 @@ void SerializationVariant::serializeBinaryBulkStateSuffix(
void SerializationVariant::deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const
DeserializeBinaryBulkStatePtr & state,
SubstreamsDeserializeStatesCache * cache) const
{
auto variant_state = std::make_shared<DeserializeBinaryBulkStateVariant>();
variant_state->states.resize(variants.size());
@ -132,7 +135,7 @@ void SerializationVariant::deserializeBinaryBulkStatePrefix(
for (size_t i = 0; i < variants.size(); ++i)
{
addVariantElementToPath(settings.path, i);
variants[i]->deserializeBinaryBulkStatePrefix(settings, variant_state->states[i]);
variants[i]->deserializeBinaryBulkStatePrefix(settings, variant_state->states[i], cache);
settings.path.pop_back();
}
@ -141,12 +144,13 @@ void SerializationVariant::deserializeBinaryBulkStatePrefix(
}
void SerializationVariant::serializeBinaryBulkWithMultipleStreams(
void SerializationVariant::serializeBinaryBulkWithMultipleStreamsAndUpdateVariantStatistics(
const IColumn & column,
size_t offset,
size_t limit,
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const
SerializeBinaryBulkStatePtr & state,
std::unordered_map<String, size_t> & variants_statistics) const
{
const ColumnVariant & col = assert_cast<const ColumnVariant &>(column);
if (const size_t size = col.size(); limit == 0 || offset + limit > size)
@ -185,6 +189,7 @@ void SerializationVariant::serializeBinaryBulkWithMultipleStreams(
{
addVariantElementToPath(settings.path, i);
variants[i]->serializeBinaryBulkWithMultipleStreams(col.getVariantByGlobalDiscriminator(i), 0, 0, settings, variant_state->states[i]);
variants_statistics[variant_names[i]] += col.getVariantByGlobalDiscriminator(i).size();
settings.path.pop_back();
}
settings.path.pop_back();
@ -205,6 +210,7 @@ void SerializationVariant::serializeBinaryBulkWithMultipleStreams(
addVariantElementToPath(settings.path, non_empty_global_discr);
/// We can use the same offset/limit as for whole Variant column
variants[non_empty_global_discr]->serializeBinaryBulkWithMultipleStreams(col.getVariantByGlobalDiscriminator(non_empty_global_discr), offset, limit, settings, variant_state->states[non_empty_global_discr]);
variants_statistics[variant_names[non_empty_global_discr]] += limit;
settings.path.pop_back();
settings.path.pop_back();
return;
@ -244,12 +250,23 @@ void SerializationVariant::serializeBinaryBulkWithMultipleStreams(
variant_offsets_and_limits[i].second,
settings,
variant_state->states[i]);
variants_statistics[variant_names[i]] += variant_offsets_and_limits[i].second;
settings.path.pop_back();
}
}
settings.path.pop_back();
}
void SerializationVariant::serializeBinaryBulkWithMultipleStreams(
const DB::IColumn & column,
size_t offset,
size_t limit,
DB::ISerialization::SerializeBinaryBulkSettings & settings,
DB::ISerialization::SerializeBinaryBulkStatePtr & state) const
{
std::unordered_map<String, size_t> tmp_statistics;
serializeBinaryBulkWithMultipleStreamsAndUpdateVariantStatistics(column, offset, limit, settings, state, tmp_statistics);
}
void SerializationVariant::deserializeBinaryBulkWithMultipleStreams(
ColumnPtr & column,
@ -599,14 +616,14 @@ void SerializationVariant::serializeTextEscaped(const IColumn & column, size_t r
bool SerializationVariant::tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
String field;
readEscapedString(field, istr);
settings.tsv.crlf_end_of_line_input ? readEscapedStringCRLF(field, istr) : readEscapedString(field, istr);
return tryDeserializeTextEscapedImpl(column, field, settings);
}
void SerializationVariant::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
String field;
readEscapedString(field, istr);
settings.tsv.crlf_end_of_line_input ? readEscapedStringCRLF(field, istr) : readEscapedString(field, istr);
if (!tryDeserializeTextEscapedImpl(column, field, settings))
throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse escaped value of type {} here: {}", variant_name, field);
}
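The new serializeBinaryBulkWithMultipleStreamsAndUpdateVariantStatistics overload additionally fills a map from variant type name to the number of rows written for that variant, and the plain overload now delegates to it with a throwaway map. A caller-side sketch, assuming the usual ClickHouse headers and a writer that has already prepared settings and state (the helper name and surrounding writer code are assumptions, not from this diff):

#include <unordered_map>

// Hypothetical writer helper: serialize a whole Variant column and collect per-variant row counts.
std::unordered_map<String, size_t> writeVariantWithStatistics(
    const SerializationVariant & serialization,
    const IColumn & column,
    ISerialization::SerializeBinaryBulkSettings & settings,
    ISerialization::SerializeBinaryBulkStatePtr & state)
{
    std::unordered_map<String, size_t> variants_statistics;
    serialization.serializeBinaryBulkWithMultipleStreamsAndUpdateVariantStatistics(
        column, /*offset=*/ 0, /*limit=*/ column.size(), settings, state, variants_statistics);
    /// e.g. variants_statistics["UInt64"] == number of UInt64 rows written by this call
    return variants_statistics;
}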

View File

@ -59,7 +59,8 @@ public:
void deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const override;
DeserializeBinaryBulkStatePtr & state,
SubstreamsDeserializeStatesCache * cache) const override;
void serializeBinaryBulkWithMultipleStreams(
const IColumn & column,
@ -68,6 +69,14 @@ public:
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void serializeBinaryBulkWithMultipleStreamsAndUpdateVariantStatistics(
const IColumn & column,
size_t offset,
size_t limit,
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state,
std::unordered_map<String, size_t> & variants_statistics) const;
void deserializeBinaryBulkWithMultipleStreams(
ColumnPtr & column,
size_t limit,

View File

@ -2,6 +2,7 @@
#include <DataTypes/Serializations/SerializationNumber.h>
#include <Columns/ColumnLowCardinality.h>
#include <Columns/ColumnNullable.h>
#include <IO/ReadHelpers.h>
namespace DB
{
@ -11,34 +12,6 @@ namespace ErrorCodes
extern const int NOT_IMPLEMENTED;
}
void SerializationVariantElement::enumerateStreams(
DB::ISerialization::EnumerateStreamsSettings & settings,
const DB::ISerialization::StreamCallback & callback,
const DB::ISerialization::SubstreamData & data) const
{
/// We will need stream for discriminators during deserialization.
settings.path.push_back(Substream::VariantDiscriminators);
callback(settings.path);
settings.path.pop_back();
addVariantToPath(settings.path);
settings.path.back().data = data;
nested_serialization->enumerateStreams(settings, callback, data);
removeVariantFromPath(settings.path);
}
void SerializationVariantElement::serializeBinaryBulkStatePrefix(const IColumn &, SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const
{
throw Exception(
ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinaryBulkStatePrefix is not implemented for SerializationVariantElement");
}
void SerializationVariantElement::serializeBinaryBulkStateSuffix(SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const
{
throw Exception(
ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinaryBulkStateSuffix is not implemented for SerializationVariantElement");
}
struct DeserializeBinaryBulkStateVariantElement : public ISerialization::DeserializeBinaryBulkState
{
/// During deserialization discriminators and variant streams can be shared.
@ -55,12 +28,47 @@ struct DeserializeBinaryBulkStateVariantElement : public ISerialization::Deseria
ISerialization::DeserializeBinaryBulkStatePtr variant_element_state;
};
void SerializationVariantElement::deserializeBinaryBulkStatePrefix(DeserializeBinaryBulkSettings & settings, DeserializeBinaryBulkStatePtr & state) const
void SerializationVariantElement::enumerateStreams(
DB::ISerialization::EnumerateStreamsSettings & settings,
const DB::ISerialization::StreamCallback & callback,
const DB::ISerialization::SubstreamData & data) const
{
/// We will need stream for discriminators during deserialization.
settings.path.push_back(Substream::VariantDiscriminators);
callback(settings.path);
settings.path.pop_back();
const auto * deserialize_state = data.deserialize_state ? checkAndGetState<DeserializeBinaryBulkStateVariantElement>(data.deserialize_state) : nullptr;
addVariantToPath(settings.path);
auto nested_data = SubstreamData(nested_serialization)
.withType(data.type ? removeNullableOrLowCardinalityNullable(data.type) : nullptr)
.withColumn(data.column ? removeNullableOrLowCardinalityNullable(data.column) : nullptr)
.withSerializationInfo(data.serialization_info)
.withDeserializeState(deserialize_state ? deserialize_state->variant_element_state : nullptr);
settings.path.back().data = nested_data;
nested_serialization->enumerateStreams(settings, callback, nested_data);
removeVariantFromPath(settings.path);
}
void SerializationVariantElement::serializeBinaryBulkStatePrefix(const IColumn &, SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const
{
throw Exception(
ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinaryBulkStatePrefix is not implemented for SerializationVariantElement");
}
void SerializationVariantElement::serializeBinaryBulkStateSuffix(SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const
{
throw Exception(
ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinaryBulkStateSuffix is not implemented for SerializationVariantElement");
}
void SerializationVariantElement::deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings, DeserializeBinaryBulkStatePtr & state, SubstreamsDeserializeStatesCache * cache) const
{
auto variant_element_state = std::make_shared<DeserializeBinaryBulkStateVariantElement>();
addVariantToPath(settings.path);
nested_serialization->deserializeBinaryBulkStatePrefix(settings, variant_element_state->variant_element_state);
nested_serialization->deserializeBinaryBulkStatePrefix(settings, variant_element_state->variant_element_state, cache);
removeVariantFromPath(settings.path);
state = std::move(variant_element_state);

View File

@ -43,7 +43,8 @@ public:
void deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const override;
DeserializeBinaryBulkStatePtr & state,
SubstreamsDeserializeStatesCache * cache) const override;
void serializeBinaryBulkWithMultipleStreams(
const IColumn & column,
@ -59,12 +60,6 @@ public:
DeserializeBinaryBulkStatePtr & state,
SubstreamsCache * cache) const override;
private:
friend SerializationVariant;
void addVariantToPath(SubstreamPath & path) const;
void removeVariantFromPath(SubstreamPath & path) const;
struct VariantSubcolumnCreator : public ISubcolumnCreator
{
const ColumnPtr local_discriminators;
@ -82,6 +77,11 @@ private:
ColumnPtr create(const ColumnPtr & prev) const override;
SerializationPtr create(const SerializationPtr & prev) const override;
};
private:
friend SerializationVariant;
void addVariantToPath(SubstreamPath & path) const;
void removeVariantFromPath(SubstreamPath & path) const;
};
}

View File

@ -29,9 +29,10 @@ void SerializationWrapper::serializeBinaryBulkStateSuffix(
void SerializationWrapper::deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const
DeserializeBinaryBulkStatePtr & state,
SubstreamsDeserializeStatesCache * cache) const
{
nested_serialization->deserializeBinaryBulkStatePrefix(settings, state);
nested_serialization->deserializeBinaryBulkStatePrefix(settings, state, cache);
}
void SerializationWrapper::serializeBinaryBulkWithMultipleStreams(

Some files were not shown because too many files have changed in this diff.