Backport #66444 to 24.8: Implement new JSON data type.

This commit is contained in:
robot-clickhouse 2024-08-16 13:14:20 +00:00
parent 2adfd03b48
commit 387d6a8eef
305 changed files with 17015 additions and 2289 deletions

View File

@ -5620,6 +5620,19 @@ Minimal size of block to compress in CROSS JOIN. Zero value means - disable this
Default value: `1GiB`. Default value: `1GiB`.
## use_json_alias_for_old_object_type
When enabled, `JSON` data type alias will be used to create an old [Object('json')](../../sql-reference/data-types/json.md) type instead of the new [JSON](../../sql-reference/data-types/newjson.md) type.
This setting requires server restart to take effect when changed.
Default value: `false`.
## type_json_skip_duplicated_paths
When enabled, ClickHouse will skip duplicated paths during parsing of [JSON](../../sql-reference/data-types/newjson.md) object. Only the value of the first occurrence of each path will be inserted.
Default value: `false`.
## restore_replace_external_engines_to_null ## restore_replace_external_engines_to_null
For testing purposes. Replaces all external engines to Null to not initiate external connections. For testing purposes. Replaces all external engines to Null to not initiate external connections.

View File

@ -12,57 +12,59 @@ This specification describes the binary format that can be used for binary encod
The table below describes how each data type is represented in binary format. Each data type encoding consist of 1 byte that indicates the type and some optional additional information. The table below describes how each data type is represented in binary format. Each data type encoding consist of 1 byte that indicates the type and some optional additional information.
`var_uint` in the binary encoding means that the size is encoded using Variable-Length Quantity compression. `var_uint` in the binary encoding means that the size is encoded using Variable-Length Quantity compression.
| ClickHouse data type | Binary encoding | | ClickHouse data type | Binary encoding |
|--------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| |-----------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `Nothing` | `0x00` | | `Nothing` | `0x00` |
| `UInt8` | `0x01` | | `UInt8` | `0x01` |
| `UInt16` | `0x02` | | `UInt16` | `0x02` |
| `UInt32` | `0x03` | | `UInt32` | `0x03` |
| `UInt64` | `0x04` | | `UInt64` | `0x04` |
| `UInt128` | `0x05` | | `UInt128` | `0x05` |
| `UInt256` | `0x06` | | `UInt256` | `0x06` |
| `Int8` | `0x07` | | `Int8` | `0x07` |
| `Int16` | `0x08` | | `Int16` | `0x08` |
| `Int32` | `0x09` | | `Int32` | `0x09` |
| `Int64` | `0x0A` | | `Int64` | `0x0A` |
| `Int128` | `0x0B` | | `Int128` | `0x0B` |
| `Int256` | `0x0C` | | `Int256` | `0x0C` |
| `Float32` | `0x0D` | | `Float32` | `0x0D` |
| `Float64` | `0x0E` | | `Float64` | `0x0E` |
| `Date` | `0x0F` | | `Date` | `0x0F` |
| `Date32` | `0x10` | | `Date32` | `0x10` |
| `DateTime` | `0x11` | | `DateTime` | `0x11` |
| `DateTime(time_zone)` | `0x12<var_uint_time_zone_name_size><time_zone_name_data>` | | `DateTime(time_zone)` | `0x12<var_uint_time_zone_name_size><time_zone_name_data>` |
| `DateTime64(P)` | `0x13<uint8_precision>` | | `DateTime64(P)` | `0x13<uint8_precision>` |
| `DateTime64(P, time_zone)` | `0x14<uint8_precision><var_uint_time_zone_name_size><time_zone_name_data>` | | `DateTime64(P, time_zone)` | `0x14<uint8_precision><var_uint_time_zone_name_size><time_zone_name_data>` |
| `String` | `0x15` | | `String` | `0x15` |
| `FixedString(N)` | `0x16<var_uint_size>` | | `FixedString(N)` | `0x16<var_uint_size>` |
| `Enum8` | `0x17<var_uint_number_of_elements><var_uint_name_size_1><name_data_1><int8_value_1>...<var_uint_name_size_N><name_data_N><int8_value_N>` | | `Enum8` | `0x17<var_uint_number_of_elements><var_uint_name_size_1><name_data_1><int8_value_1>...<var_uint_name_size_N><name_data_N><int8_value_N>` |
| `Enum16` | `0x18<var_uint_number_of_elements><var_uint_name_size_1><name_data_1><int16_little_endian_value_1>...><var_uint_name_size_N><name_data_N><int16_little_endian_value_N>` | | `Enum16` | `0x18<var_uint_number_of_elements><var_uint_name_size_1><name_data_1><int16_little_endian_value_1>...><var_uint_name_size_N><name_data_N><int16_little_endian_value_N>` |
| `Decimal32(P, S)` | `0x19<uint8_precision><uint8_scale>` | | `Decimal32(P, S)` | `0x19<uint8_precision><uint8_scale>` |
| `Decimal64(P, S)` | `0x1A<uint8_precision><uint8_scale>` | | `Decimal64(P, S)` | `0x1A<uint8_precision><uint8_scale>` |
| `Decimal128(P, S)` | `0x1B<uint8_precision><uint8_scale>` | | `Decimal128(P, S)` | `0x1B<uint8_precision><uint8_scale>` |
| `Decimal256(P, S)` | `0x1C<uint8_precision><uint8_scale>` | | `Decimal256(P, S)` | `0x1C<uint8_precision><uint8_scale>` |
| `UUID` | `0x1D` | | `UUID` | `0x1D` |
| `Array(T)` | `0x1E<nested_type_encoding>` | | `Array(T)` | `0x1E<nested_type_encoding>` |
| `Tuple(T1, ..., TN)` | `0x1F<var_uint_number_of_elements><nested_type_encoding_1>...<nested_type_encoding_N>` | | `Tuple(T1, ..., TN)` | `0x1F<var_uint_number_of_elements><nested_type_encoding_1>...<nested_type_encoding_N>` |
| `Tuple(name1 T1, ..., nameN TN)` | `0x20<var_uint_number_of_elements><var_uint_name_size_1><name_data_1><nested_type_encoding_1>...<var_uint_name_size_N><name_data_N><nested_type_encoding_N>` | | `Tuple(name1 T1, ..., nameN TN)` | `0x20<var_uint_number_of_elements><var_uint_name_size_1><name_data_1><nested_type_encoding_1>...<var_uint_name_size_N><name_data_N><nested_type_encoding_N>` |
| `Set` | `0x21` | | `Set` | `0x21` |
| `Interval` | `0x22<interval_kind>` (see [interval kind binary encoding](#interval-kind-binary-encoding)) | | `Interval` | `0x22<interval_kind>` (see [interval kind binary encoding](#interval-kind-binary-encoding)) |
| `Nullable(T)` | `0x23<nested_type_encoding>` | | `Nullable(T)` | `0x23<nested_type_encoding>` |
| `Function` | `0x24<var_uint_number_of_arguments><argument_type_encoding_1>...<argument_type_encoding_N><return_type_encoding>` | | `Function` | `0x24<var_uint_number_of_arguments><argument_type_encoding_1>...<argument_type_encoding_N><return_type_encoding>` |
| `AggregateFunction(function_name(param_1, ..., param_N), arg_T1, ..., arg_TN)` | `0x25<var_uint_version><var_uint_function_name_size><function_name_data><var_uint_number_of_parameters><param_1>...<param_N><var_uint_number_of_arguments><argument_type_encoding_1>...<argument_type_encoding_N>` (see [aggregate function parameter binary encoding](#aggregate-function-parameter-binary-encoding)) | | `AggregateFunction(function_name(param_1, ..., param_N), arg_T1, ..., arg_TN)` | `0x25<var_uint_version><var_uint_function_name_size><function_name_data><var_uint_number_of_parameters><param_1>...<param_N><var_uint_number_of_arguments><argument_type_encoding_1>...<argument_type_encoding_N>` (see [aggregate function parameter binary encoding](#aggregate-function-parameter-binary-encoding)) |
| `LowCardinality(T)` | `0x26<nested_type_encoding>` | | `LowCardinality(T)` | `0x26<nested_type_encoding>` |
| `Map(K, V)` | `0x27<key_type_encoding><value_type_encoding>` | | `Map(K, V)` | `0x27<key_type_encoding><value_type_encoding>` |
| `IPv4` | `0x28` | | `IPv4` | `0x28` |
| `IPv6` | `0x29` | | `IPv6` | `0x29` |
| `Variant(T1, ..., TN)` | `0x2A<var_uint_number_of_variants><variant_type_encoding_1>...<variant_type_encoding_N>` | | `Variant(T1, ..., TN)` | `0x2A<var_uint_number_of_variants><variant_type_encoding_1>...<variant_type_encoding_N>` |
| `Dynamic(max_types=N)` | `0x2B<uint8_max_types>` | | `Dynamic(max_types=N)` | `0x2B<uint8_max_types>` |
| `Custom type` (`Ring`, `Polygon`, etc) | `0x2C<var_uint_type_name_size><type_name_data>` | | `Custom type` (`Ring`, `Polygon`, etc) | `0x2C<var_uint_type_name_size><type_name_data>` |
| `Bool` | `0x2D` | | `Bool` | `0x2D` |
| `SimpleAggregateFunction(function_name(param_1, ..., param_N), arg_T1, ..., arg_TN)` | `0x2E<var_uint_function_name_size><function_name_data><var_uint_number_of_parameters><param_1>...<param_N><var_uint_number_of_arguments><argument_type_encoding_1>...<argument_type_encoding_N>` (see [aggregate function parameter binary encoding](#aggregate-function-parameter-binary-encoding)) | | `SimpleAggregateFunction(function_name(param_1, ..., param_N), arg_T1, ..., arg_TN)` | `0x2E<var_uint_function_name_size><function_name_data><var_uint_number_of_parameters><param_1>...<param_N><var_uint_number_of_arguments><argument_type_encoding_1>...<argument_type_encoding_N>` (see [aggregate function parameter binary encoding](#aggregate-function-parameter-binary-encoding)) |
| `Nested(name1 T1, ..., nameN TN)` | `0x2F<var_uint_number_of_elements><var_uint_name_size_1><name_data_1><nested_type_encoding_1>...<var_uint_name_size_N><name_data_N><nested_type_encoding_N>` | | `Nested(name1 T1, ..., nameN TN)` | `0x2F<var_uint_number_of_elements><var_uint_name_size_1><name_data_1><nested_type_encoding_1>...<var_uint_name_size_N><name_data_N><nested_type_encoding_N>` |
| `JSON(max_dynamic_paths=N, max_dynamic_types=M, path Type, SKIP skip_path, SKIP REGEXP skip_path_regexp)` | `0x30<uint8_serialization_version><var_int_max_dynamic_paths><uint8_max_dynamic_types><var_uint_number_of_typed_paths><var_uint_path_name_size_1><path_name_data_1><encoded_type_1>...<var_uint_number_of_skip_paths><var_uint_skip_path_size_1><skip_path_data_1>...<var_uint_number_of_skip_path_regexps><var_uint_skip_path_regexp_size_1><skip_path_data_regexp_1>...` |
For type `JSON`, the byte `uint8_serialization_version` indicates the version of the serialization. Right now the version is always 0, but it can change in the future if new arguments are introduced for the `JSON` type.
### Interval kind binary encoding ### Interval kind binary encoding

View File

@ -19,7 +19,8 @@ ClickHouse data types include:
- **Boolean**: ClickHouse has a [`Boolean` type](./boolean.md) - **Boolean**: ClickHouse has a [`Boolean` type](./boolean.md)
- **Strings**: [`String`](./string.md) and [`FixedString`](./fixedstring.md) - **Strings**: [`String`](./string.md) and [`FixedString`](./fixedstring.md)
- **Dates**: use [`Date`](./date.md) and [`Date32`](./date32.md) for days, and [`DateTime`](./datetime.md) and [`DateTime64`](./datetime64.md) for instances in time - **Dates**: use [`Date`](./date.md) and [`Date32`](./date32.md) for days, and [`DateTime`](./datetime.md) and [`DateTime64`](./datetime64.md) for instances in time
- **JSON**: the [`JSON` object](./json.md) stores a JSON document in a single column - **Object**: the [`Object`](./json.md) stores a JSON document in a single column (deprecated)
- **JSON**: the [`JSON` object](./newjson.md) stores a JSON document in a single column
- **UUID**: a performant option for storing [`UUID` values](./uuid.md) - **UUID**: a performant option for storing [`UUID` values](./uuid.md)
- **Low cardinality types**: use an [`Enum`](./enum.md) when you have a handful of unique values, or use [`LowCardinality`](./lowcardinality.md) when you have up to 10,000 unique values of a column - **Low cardinality types**: use an [`Enum`](./enum.md) when you have a handful of unique values, or use [`LowCardinality`](./lowcardinality.md) when you have up to 10,000 unique values of a column
- **Arrays**: any column can be defined as an [`Array` of values](./array.md) - **Arrays**: any column can be defined as an [`Array` of values](./array.md)

View File

@ -13,7 +13,7 @@ keywords: [object, data type]
Stores JavaScript Object Notation (JSON) documents in a single column. Stores JavaScript Object Notation (JSON) documents in a single column.
`JSON` is an alias for `Object('json')`. `JSON` can be used as an alias to `Object('json')` when setting [use_json_alias_for_old_object_type](../../operations/settings/settings.md#usejsonaliasforoldobjecttype) is enabled.
## Example ## Example
@ -81,3 +81,4 @@ SELECT * FROM json FORMAT JSONEachRow
- [Using JSON in ClickHouse](/docs/en/integrations/data-formats/json) - [Using JSON in ClickHouse](/docs/en/integrations/data-formats/json)
- [Getting Data Into ClickHouse - Part 2 - A JSON detour](https://clickhouse.com/blog/getting-data-into-clickhouse-part-2-json) - [Getting Data Into ClickHouse - Part 2 - A JSON detour](https://clickhouse.com/blog/getting-data-into-clickhouse-part-2-json)
-

View File

@ -0,0 +1,516 @@
---
slug: /en/sql-reference/data-types/newjson
sidebar_position: 63
sidebar_label: JSON
keywords: [json, data type]
---
# JSON
Stores JavaScript Object Notation (JSON) documents in a single column.
:::note
This feature is experimental and is not production-ready. If you need to work with JSON documents, consider using [this guide](/docs/en/integrations/data-formats/json/overview) instead.
If you want to use JSON type, set `allow_experimental_json_type = 1`.
:::
To declare a column of `JSON` type, use the following syntax:
``` sql
<column_name> JSON(max_dynamic_paths=N, max_dynamic_types=M, some.path TypeName, SKIP path.to.skip, SKIP REGEXP 'paths_regexp')
```
Where:
- `max_dynamic_paths` is an optional parameter indicating how many paths can be stored separately as subcolumns across a single block of data that is stored separately (for example, across a single data part for a MergeTree table). If this limit is exceeded, all other paths will be stored together in a single structure. The default value of `max_dynamic_paths` is `1024`.
- `max_dynamic_types` is an optional parameter between `1` and `255` indicating how many different data types can be stored inside a single path column with type `Dynamic` across a single block of data that is stored separately (for example, across a single data part for a MergeTree table). If this limit is exceeded, all new types will be converted to type `String`. The default value of `max_dynamic_types` is `32`.
- `some.path TypeName` is an optional type hint for particular path in the JSON. Such paths will be always stored as subcolumns with specified type.
- `SKIP path.to.skip` is an optional hint for particular path that should be skipped during JSON parsing. Such paths will never be stored in the JSON column. If specified path is a nested JSON object, the whole nested object will be skipped.
- `SKIP REGEXP 'path_regexp'` is an optional hint with a regular expression that is used to skip paths during JSON parsing. All paths that match this regular expression will never be stored in the JSON column.
## Creating JSON
Using `JSON` type in table column definition:
```sql
CREATE TABLE test (json JSON) ENGINE = Memory;
INSERT INTO test VALUES ('{"a" : {"b" : 42}, "c" : [1, 2, 3]}'), ('{"f" : "Hello, World!"}'), ('{"a" : {"b" : 43, "e" : 10}, "c" : [4, 5, 6]}');
SELECT json FROM test;
```
```text
┌─json────────────────────────────────────────┐
│ {"a":{"b":"42"},"c":["1","2","3"]} │
│ {"f":"Hello, World!"} │
│ {"a":{"b":"43","e":"10"},"c":["4","5","6"]} │
└─────────────────────────────────────────────┘
```
```sql
CREATE TABLE test (json JSON(a.b UInt32, SKIP a.e)) ENGINE = Memory;
INSERT INTO test VALUES ('{"a" : {"b" : 42}, "c" : [1, 2, 3]}'), ('{"f" : "Hello, World!"}'), ('{"a" : {"b" : 43, "e" : 10}, "c" : [4, 5, 6]}');
SELECT json FROM test;
```
```text
┌─json──────────────────────────────┐
│ {"a":{"b":42},"c":[1,2,3]} │
│ {"a":{"b":0},"f":"Hello, World!"} │
│ {"a":{"b":43},"c":[4,5,6]} │
└───────────────────────────────────┘
```
Using CAST from 'String':
```sql
SELECT '{"a" : {"b" : 42},"c" : [1, 2, 3], "d" : "Hello, World!"}'::JSON as json;
```
```text
┌─json───────────────────────────────────────────┐
│ {"a":{"b":42},"c":[1,2,3],"d":"Hello, World!"} │
└────────────────────────────────────────────────┘
```
CAST from named `Tuple`, `Map` and `Object('json')` to `JSON` type will be supported later.
## Reading JSON paths as subcolumns
JSON type supports reading every path as a separate subcolumn. If type of the requested path was not specified in the JSON type declaration, the subcolumn of the path will always have type [Dynamic](/docs/en/sql-reference/data-types/dynamic.md).
For example:
```sql
CREATE TABLE test (json JSON(a.b UInt32, SKIP a.e)) ENGINE = Memory;
INSERT INTO test VALUES ('{"a" : {"b" : 42, "g" : 42.42}, "c" : [1, 2, 3], "d" : "2020-01-01"}'), ('{"f" : "Hello, World!", "d" : "2020-01-02"}'), ('{"a" : {"b" : 43, "e" : 10, "g" : 43.43}, "c" : [4, 5, 6]}');
SELECT json FROM test;
```
```text
┌─json──────────────────────────────────────────────────┐
│ {"a":{"b":42,"g":42.42},"c":[1,2,3],"d":"2020-01-01"} │
│ {"a":{"b":0},"d":"2020-01-02","f":"Hello, World!"} │
│ {"a":{"b":43,"g":43.43},"c":[4,5,6]} │
└───────────────────────────────────────────────────────┘
```
```sql
SELECT json.a.b, json.a.g, json.c, json.d FROM test;
```
```text
┌─json.a.b─┬─json.a.g─┬─json.c──┬─json.d─────┐
│ 42 │ 42.42 │ [1,2,3] │ 2020-01-01 │
│ 0 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 2020-01-02 │
│ 43 │ 43.43 │ [4,5,6] │ ᴺᵁᴸᴸ │
└──────────┴──────────┴─────────┴────────────┘
```
If the requested path wasn't found in the data, it will be filled with `NULL` values:
```sql
SELECT json.non.existing.path FROM test;
```
```text
┌─json.non.existing.path─┐
│ ᴺᵁᴸᴸ │
│ ᴺᵁᴸᴸ │
│ ᴺᵁᴸᴸ │
└────────────────────────┘
```
Let's check the data types of returned subcolumns:
```sql
SELECT toTypeName(json.a.b), toTypeName(json.a.g), toTypeName(json.c), toTypeName(json.d) FROM test;
```
```text
┌─toTypeName(json.a.b)─┬─toTypeName(json.a.g)─┬─toTypeName(json.c)─┬─toTypeName(json.d)─┐
│ UInt32 │ Dynamic │ Dynamic │ Dynamic │
│ UInt32 │ Dynamic │ Dynamic │ Dynamic │
│ UInt32 │ Dynamic │ Dynamic │ Dynamic │
└──────────────────────┴──────────────────────┴────────────────────┴────────────────────┘
```
As we can see, for `a.b` the type is `UInt32` as we specified in the JSON type declaration, and for all other subcolumns the type is `Dynamic`.
It is also possible to read subcolumns of a `Dynamic` type using special syntax `json.some.path.:TypeName`:
```sql
select json.a.g.:Float64, dynamicType(json.a.g), json.d.:Date, dynamicType(json.d) FROM test;
```
```text
┌─json.a.g.:`Float64`─┬─dynamicType(json.a.g)─┬─json.d.:`Date`─┬─dynamicType(json.d)─┐
│ 42.42 │ Float64 │ 2020-01-01 │ Date │
│ ᴺᵁᴸᴸ │ None │ 2020-01-02 │ Date │
│ 43.43 │ Float64 │ ᴺᵁᴸᴸ │ None │
└─────────────────────┴───────────────────────┴────────────────┴─────────────────────┘
```
`Dynamic` subcolumns can be cast to any data type. An exception will be thrown if the internal type inside `Dynamic` cannot be cast to the requested type:
```sql
select json.a.g::UInt64 as uint FROM test;
```
```text
┌─uint─┐
│ 42 │
│ 0 │
│ 43 │
└──────┘
```
```sql
select json.a.g::UUID as float FROM test;
```
```text
Received exception:
Code: 48. DB::Exception: Conversion between numeric types and UUID is not supported. Probably the passed UUID is unquoted: while executing 'FUNCTION CAST(__table1.json.a.g :: 2, 'UUID'_String :: 1) -> CAST(__table1.json.a.g, 'UUID'_String) UUID : 0'. (NOT_IMPLEMENTED)
```
## Reading JSON sub-objects as subcolumns
JSON type supports reading nested objects as subcolumns with type `JSON` using special syntax `json.^some.path`:
```sql
CREATE TABLE test (json JSON) ENGINE = Memory;
INSERT INTO test VALUES ('{"a" : {"b" : {"c" : 42, "g" : 42.42}}, "c" : [1, 2, 3], "d" : {"e" : {"f" : {"g" : "Hello, World", "h" : [1, 2, 3]}}}}'), ('{"f" : "Hello, World!", "d" : {"e" : {"f" : {"h" : [4, 5, 6]}}}}'), ('{"a" : {"b" : {"c" : 43, "e" : 10, "g" : 43.43}}, "c" : [4, 5, 6]}');
SELECT json FROM test;
```
```text
┌─json────────────────────────────────────────────────────────────────────────────────────────┐
│ {"a":{"b":{"c":42,"g":42.42}},"c":[1,2,3],"d":{"e":{"f":{"g":"Hello, World","h":[1,2,3]}}}} │
│ {"d":{"e":{"f":{"h":[4,5,6]}}},"f":"Hello, World!"} │
│ {"a":{"b":{"c":43,"e":10,"g":43.43}},"c":[4,5,6]} │
└─────────────────────────────────────────────────────────────────────────────────────────────┘
```
```sql
SELECT json.^a.b, json.^d.e.f FROM test;
```
```text
┌─json.^`a`.b───────────────┬─json.^`d`.e.f────────────────────┐
│ {"c":42,"g":42.42} │ {"g":"Hello, World","h":[1,2,3]} │
│ {} │ {"h":[4,5,6]} │
│ {"c":43,"e":10,"g":43.43} │ {} │
└───────────────────────────┴──────────────────────────────────┘
```
:::note
Reading sub-objects as subcolumns may be inefficient, as this may require almost full scan of the JSON data.
:::
## Types inference for paths
During JSON parsing, ClickHouse tries to detect the most appropriate data type for each JSON path. It works similarly to [automatic schema inference from input data](/docs/en/interfaces/schema-inference.md) and is controlled by the same settings:
- [input_format_try_infer_integers](/docs/en/interfaces/schema-inference.md#inputformattryinferintegers)
- [input_format_try_infer_dates](/docs/en/interfaces/schema-inference.md#inputformattryinferdates)
- [input_format_try_infer_datetimes](/docs/en/interfaces/schema-inference.md#inputformattryinferdatetimes)
- [schema_inference_make_columns_nullable](/docs/en/interfaces/schema-inference.md#schemainferencemakecolumnsnullable)
- [input_format_json_try_infer_numbers_from_strings](/docs/en/interfaces/schema-inference.md#inputformatjsontryinfernumbersfromstrings)
- [input_format_json_infer_incomplete_types_as_strings](/docs/en/interfaces/schema-inference.md#inputformatjsoninferincompletetypesasstrings)
- [input_format_json_read_numbers_as_strings](/docs/en/interfaces/schema-inference.md#inputformatjsonreadnumbersasstrings)
- [input_format_json_read_bools_as_strings](/docs/en/interfaces/schema-inference.md#inputformatjsonreadboolsasstrings)
- [input_format_json_read_bools_as_numbers](/docs/en/interfaces/schema-inference.md#inputformatjsonreadboolsasnumbers)
- [input_format_json_read_arrays_as_strings](/docs/en/interfaces/schema-inference.md#inputformatjsonreadarraysasstrings)
Let's see some examples:
```sql
SELECT JSONAllPathsWithTypes('{"a" : "2020-01-01", "b" : "2020-01-01 10:00:00"}'::JSON) AS paths_with_types settings input_format_try_infer_dates=1, input_format_try_infer_datetimes=1;
```
```text
┌─paths_with_types─────────────────┐
│ {'a':'Date','b':'DateTime64(9)'} │
└──────────────────────────────────┘
```
```sql
SELECT JSONAllPathsWithTypes('{"a" : "2020-01-01", "b" : "2020-01-01 10:00:00"}'::JSON) AS paths_with_types settings input_format_try_infer_dates=0, input_format_try_infer_datetimes=0;
```
```text
┌─paths_with_types────────────┐
│ {'a':'String','b':'String'} │
└─────────────────────────────┘
```
```sql
SELECT JSONAllPathsWithTypes('{"a" : [1, 2, 3]}'::JSON) AS paths_with_types settings schema_inference_make_columns_nullable=1;
```
```text
┌─paths_with_types───────────────┐
│ {'a':'Array(Nullable(Int64))'} │
└────────────────────────────────┘
```
```sql
SELECT JSONAllPathsWithTypes('{"a" : [1, 2, 3]}'::JSON) AS paths_with_types settings schema_inference_make_columns_nullable=0;
```
```text
┌─paths_with_types─────┐
│ {'a':'Array(Int64)'} │
└──────────────────────┘
```
## Handling arrays of JSON objects
JSON paths that contain an array of objects are parsed as type `Array(JSON)` and inserted into the `Dynamic` column for this path. To read an array of objects, you can extract it from the `Dynamic` column as a subcolumn:
```sql
CREATE TABLE test (json JSON) ENGINE = Memory;
INSERT INTO test VALUES
('{"a" : {"b" : [{"c" : 42, "d" : "Hello", "f" : [[{"g" : 42.42}]], "k" : {"j" : 1000}}, {"c" : 43}, {"e" : [1, 2, 3], "d" : "My", "f" : [[{"g" : 43.43, "h" : "2020-01-01"}]], "k" : {"j" : 2000}}]}}'),
('{"a" : {"b" : [1, 2, 3]}}'),
('{"a" : {"b" : [{"c" : 44, "f" : [[{"h" : "2020-01-02"}]]}, {"e" : [4, 5, 6], "d" : "World", "f" : [[{"g" : 44.44}]], "k" : {"j" : 3000}}]}}');
SELECT json FROM test;
```
```text
┌─json────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
│ {"a":{"b":[{"c":"42","d":"Hello","f":[[{"g":42.42}]],"k":{"j":"1000"}},{"c":"43"},{"d":"My","e":["1","2","3"],"f":[[{"g":43.43,"h":"2020-01-01"}]],"k":{"j":"2000"}}]}} │
│ {"a":{"b":["1","2","3"]}} │
│ {"a":{"b":[{"c":"44","f":[[{"h":"2020-01-02"}]]},{"d":"World","e":["4","5","6"],"f":[[{"g":44.44}]],"k":{"j":"3000"}}]}} │
└─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
```
```sql
SELECT json.a.b, dynamicType(json.a.b) FROM test;
```
```text
┌─json.a.b──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬─dynamicType(json.a.b)────────────────────────────────────┐
│ ['{"c":"42","d":"Hello","f":[[{"g":42.42}]],"k":{"j":"1000"}}','{"c":"43"}','{"d":"My","e":["1","2","3"],"f":[[{"g":43.43,"h":"2020-01-01"}]],"k":{"j":"2000"}}'] │ Array(JSON(max_dynamic_types=16, max_dynamic_paths=256)) │
│ [1,2,3] │ Array(Nullable(Int64)) │
│ ['{"c":"44","f":[[{"h":"2020-01-02"}]]}','{"d":"World","e":["4","5","6"],"f":[[{"g":44.44}]],"k":{"j":"3000"}}'] │ Array(JSON(max_dynamic_types=16, max_dynamic_paths=256)) │
└───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┴──────────────────────────────────────────────────────────┘
```
As you can notice, the `max_dynamic_types/max_dynamic_paths` parameters of the nested `JSON` type were reduced compared to the default values. This is needed to prevent the number of subcolumns from growing uncontrollably on nested arrays of JSON objects.
Let's try to read subcolumns from this nested `JSON` column:
```sql
SELECT json.a.b.:`Array(JSON)`.c, json.a.b.:`Array(JSON)`.f, json.a.b.:`Array(JSON)`.d FROM test;
```
```text
┌─json.a.b.:`Array(JSON)`.c─┬─json.a.b.:`Array(JSON)`.f───────────────────────────────────┬─json.a.b.:`Array(JSON)`.d─┐
│ [42,43,NULL] │ [[['{"g":42.42}']],NULL,[['{"g":43.43,"h":"2020-01-01"}']]] │ ['Hello',NULL,'My'] │
│ [] │ [] │ [] │
│ [44,NULL] │ [[['{"h":"2020-01-02"}']],[['{"g":44.44}']]] │ [NULL,'World'] │
└───────────────────────────┴─────────────────────────────────────────────────────────────┴───────────────────────────┘
```
We can avoid writing `Array(JSON)` subcolumn name using special syntax:
```sql
SELECT json.a.b[].c, json.a.b[].f, json.a.b[].d FROM test;
```
```text
┌─json.a.b.:`Array(JSON)`.c─┬─json.a.b.:`Array(JSON)`.f───────────────────────────────────┬─json.a.b.:`Array(JSON)`.d─┐
│ [42,43,NULL] │ [[['{"g":42.42}']],NULL,[['{"g":43.43,"h":"2020-01-01"}']]] │ ['Hello',NULL,'My'] │
│ [] │ [] │ [] │
│ [44,NULL] │ [[['{"h":"2020-01-02"}']],[['{"g":44.44}']]] │ [NULL,'World'] │
└───────────────────────────┴─────────────────────────────────────────────────────────────┴───────────────────────────┘
```
The number of `[]` after the path indicates the array level. `json.path[][]` will be transformed to `json.path.:Array(Array(JSON))`.
Let's check the paths and types inside our `Array(JSON)`:
```sql
SELECT DISTINCT arrayJoin(JSONAllPathsWithTypes(arrayJoin(json.a.b[]))) FROM test;
```
```text
┌─arrayJoin(JSONAllPathsWithTypes(arrayJoin(json.a.b.:`Array(JSON)`)))──┐
│ ('c','Int64') │
│ ('d','String') │
│ ('f','Array(Array(JSON(max_dynamic_types=8, max_dynamic_paths=64)))') │
│ ('k.j','Int64') │
│ ('e','Array(Nullable(Int64))') │
└───────────────────────────────────────────────────────────────────────┘
```
Let's read subcolumns from `Array(JSON)` column:
```sql
SELECT json.a.b[].c.:Int64, json.a.b[].f[][].g.:Float64, json.a.b[].f[][].h.:Date FROM test;
```
```text
┌─json.a.b.:`Array(JSON)`.c.:`Int64`─┬─json.a.b.:`Array(JSON)`.f.:`Array(Array(JSON))`.g.:`Float64`─┬─json.a.b.:`Array(JSON)`.f.:`Array(Array(JSON))`.h.:`Date`─┐
│ [42,43,NULL] │ [[[42.42]],[],[[43.43]]] │ [[[NULL]],[],[['2020-01-01']]] │
│ [] │ [] │ [] │
│ [44,NULL] │ [[[NULL]],[[44.44]]] │ [[['2020-01-02']],[[NULL]]] │
└────────────────────────────────────┴──────────────────────────────────────────────────────────────┴───────────────────────────────────────────────────────────┘
```
We can also read sub-object subcolumns from nested `JSON` column:
```sql
SELECT json.a.b[].^k FROM test
```
```text
┌─json.a.b.:`Array(JSON)`.^`k`─────────┐
│ ['{"j":"1000"}','{}','{"j":"2000"}'] │
│ [] │
│ ['{}','{"j":"3000"}'] │
└──────────────────────────────────────┘
```
## Reading JSON type from the data
All text formats (JSONEachRow, TSV, CSV, CustomSeparated, Values, etc) support reading the `JSON` type.
Examples:
```sql
SELECT json FROM format(JSONEachRow, 'json JSON(a.b.c UInt32, SKIP a.b.d, SKIP d.e, SKIP REGEXP \'b.*\')', '
{"json" : {"a" : {"b" : {"c" : 1, "d" : [0, 1]}}, "b" : "2020-01-01", "c" : 42, "d" : {"e" : {"f" : ["s1", "s2"]}, "i" : [1, 2, 3]}}}
{"json" : {"a" : {"b" : {"c" : 2, "d" : [2, 3]}}, "b" : [1, 2, 3], "c" : null, "d" : {"e" : {"g" : 43}, "i" : [4, 5, 6]}}}
{"json" : {"a" : {"b" : {"c" : 3, "d" : [4, 5]}}, "b" : {"c" : 10}, "e" : "Hello, World!"}}
{"json" : {"a" : {"b" : {"c" : 4, "d" : [6, 7]}}, "c" : 43}}
{"json" : {"a" : {"b" : {"c" : 5, "d" : [8, 9]}}, "b" : {"c" : 11, "j" : [1, 2, 3]}, "d" : {"e" : {"f" : ["s3", "s4"], "g" : 44}, "h" : "2020-02-02 10:00:00"}}}
')
```
```text
┌─json──────────────────────────────────────────────────────────┐
│ {"a":{"b":{"c":1}},"c":"42","d":{"i":["1","2","3"]}} │
│ {"a":{"b":{"c":2}},"d":{"i":["4","5","6"]}} │
│ {"a":{"b":{"c":3}},"e":"Hello, World!"} │
│ {"a":{"b":{"c":4}},"c":"43"} │
│ {"a":{"b":{"c":5}},"d":{"h":"2020-02-02 10:00:00.000000000"}} │
└───────────────────────────────────────────────────────────────┘
```
For text formats like CSV/TSV/etc., `JSON` is parsed from a string containing a JSON object:
```sql
SELECT json FROM format(TSV, 'json JSON(a.b.c UInt32, SKIP a.b.d, SKIP REGEXP \'b.*\')',
'{"a" : {"b" : {"c" : 1, "d" : [0, 1]}}, "b" : "2020-01-01", "c" : 42, "d" : {"e" : {"f" : ["s1", "s2"]}, "i" : [1, 2, 3]}}
{"a" : {"b" : {"c" : 2, "d" : [2, 3]}}, "b" : [1, 2, 3], "c" : null, "d" : {"e" : {"g" : 43}, "i" : [4, 5, 6]}}
{"a" : {"b" : {"c" : 3, "d" : [4, 5]}}, "b" : {"c" : 10}, "e" : "Hello, World!"}
{"a" : {"b" : {"c" : 4, "d" : [6, 7]}}, "c" : 43}
{"a" : {"b" : {"c" : 5, "d" : [8, 9]}}, "b" : {"c" : 11, "j" : [1, 2, 3]}, "d" : {"e" : {"f" : ["s3", "s4"], "g" : 44}, "h" : "2020-02-02 10:00:00"}}')
```
```text
┌─json──────────────────────────────────────────────────────────┐
│ {"a":{"b":{"c":1}},"c":"42","d":{"i":["1","2","3"]}} │
│ {"a":{"b":{"c":2}},"d":{"i":["4","5","6"]}} │
│ {"a":{"b":{"c":3}},"e":"Hello, World!"} │
│ {"a":{"b":{"c":4}},"c":"43"} │
│ {"a":{"b":{"c":5}},"d":{"h":"2020-02-02 10:00:00.000000000"}} │
└───────────────────────────────────────────────────────────────┘
```
## Reaching the limit of dynamic paths inside JSON
The `JSON` data type can store only a limited number of paths as separate subcolumns internally. By default, this limit is 1024, but you can change it in the type declaration using the parameter `max_dynamic_paths`.
When the limit is reached, all new paths inserted to `JSON` column will be stored in a single shared data structure. It's still possible to read such paths as subcolumns, but it will require reading the whole
shared data structure to extract the values of this path. This limit is needed to avoid the enormous number of different subcolumns that can make the table unusable.
Let's see what happens when the limit is reached in different scenarios.
### Reaching the limit during data parsing
During parsing of `JSON` object from the data, when the limit is reached for current block of data, all new paths will be stored in a shared data structure. We can check it using introspection functions `JSONDynamicPaths, JSONSharedDataPaths`:
```sql
SELECT json, JSONDynamicPaths(json), JSONSharedDataPaths(json) FROM format(JSONEachRow, 'json JSON(max_dynamic_paths=3)', '
{"json" : {"a" : {"b" : 42}, "c" : [1, 2, 3]}}
{"json" : {"a" : {"b" : 43}, "d" : "2020-01-01"}}
{"json" : {"a" : {"b" : 44}, "c" : [4, 5, 6]}}
{"json" : {"a" : {"b" : 43}, "d" : "2020-01-02", "e" : "Hello", "f" : {"g" : 42.42}}}
{"json" : {"a" : {"b" : 43}, "c" : [7, 8, 9], "f" : {"g" : 43.43}, "h" : "World"}}
')
```
```text
┌─json───────────────────────────────────────────────────────────┬─JSONDynamicPaths(json)─┬─JSONSharedDataPaths(json)─┐
│ {"a":{"b":"42"},"c":["1","2","3"]} │ ['a.b','c','d'] │ [] │
│ {"a":{"b":"43"},"d":"2020-01-01"} │ ['a.b','c','d'] │ [] │
│ {"a":{"b":"44"},"c":["4","5","6"]} │ ['a.b','c','d'] │ [] │
│ {"a":{"b":"43"},"d":"2020-01-02","e":"Hello","f":{"g":42.42}} │ ['a.b','c','d'] │ ['e','f.g'] │
│ {"a":{"b":"43"},"c":["7","8","9"],"f":{"g":43.43},"h":"World"} │ ['a.b','c','d'] │ ['f.g','h'] │
└────────────────────────────────────────────────────────────────┴────────────────────────┴───────────────────────────┘
```
As we can see, by the time paths `e`, `f.g` and `h` appeared, the limit of 3 dynamic paths had already been reached (by `a.b`, `c` and `d`), so they were stored in the shared data structure.
### During merges of data parts in MergeTree table engines
During a merge of several data parts in a MergeTree table, the `JSON` column in the resulting data part can reach the limit of dynamic paths and won't be able to store all paths from the source parts as subcolumns.
In this case ClickHouse chooses which paths will remain as subcolumns after the merge and which paths will be stored in the shared data structure. In most cases ClickHouse tries to keep the paths that contain
the largest number of non-null values and move the rarest paths to the shared data structure, but it depends on the implementation.
Let's see an example of such merge. First, let's create a table with `JSON` column, set the limit of dynamic paths to `3` and insert values with `5` different paths:
```sql
CREATE TABLE test (id UInt64, json JSON(max_dynamic_paths=3)) engine=MergeTree ORDER BY id;
SYSTEM STOP MERGES test;
INSERT INTO test SELECT number, formatRow('JSONEachRow', number as a) FROM numbers(5);
INSERT INTO test SELECT number, formatRow('JSONEachRow', number as b) FROM numbers(4);
INSERT INTO test SELECT number, formatRow('JSONEachRow', number as c) FROM numbers(3);
INSERT INTO test SELECT number, formatRow('JSONEachRow', number as d) FROM numbers(2);
INSERT INTO test SELECT number, formatRow('JSONEachRow', number as e) FROM numbers(1);
```
Each insert will create a separate data part with the `JSON` column containing a single path:
```sql
SELECT count(), JSONDynamicPaths(json) AS dynamic_paths, JSONSharedDataPaths(json) AS shared_data_paths, _part FROM test GROUP BY _part, dynamic_paths, shared_data_paths ORDER BY _part ASC
```
```text
┌─count()─┬─dynamic_paths─┬─shared_data_paths─┬─_part─────┐
│ 5 │ ['a'] │ [] │ all_1_1_0 │
│ 4 │ ['b'] │ [] │ all_2_2_0 │
│ 3 │ ['c'] │ [] │ all_3_3_0 │
│ 2 │ ['d'] │ [] │ all_4_4_0 │
│ 1 │ ['e'] │ [] │ all_5_5_0 │
└─────────┴───────────────┴───────────────────┴───────────┘
```
Now, let's merge all parts into one and see what will happen:
```sql
SYSTEM START MERGES test;
OPTIMIZE TABLE test FINAL;
SELECT count(), JSONDynamicPaths(json) AS dynamic_paths, JSONSharedDataPaths(json) AS shared_data_paths, _part FROM test GROUP BY _part, dynamic_paths, shared_data_paths ORDER BY _part;
```
```text
┌─count()─┬─dynamic_paths─┬─shared_data_paths─┬─_part─────┐
│ 1 │ ['a','b','c'] │ ['e'] │ all_1_5_2 │
│ 2 │ ['a','b','c'] │ ['d'] │ all_1_5_2 │
│ 12 │ ['a','b','c'] │ [] │ all_1_5_2 │
└─────────┴───────────────┴───────────────────┴───────────┘
```
As we can see, ClickHouse kept the most frequent paths `a`, `b` and `c` and moved paths `e` and `d` to shared data structure.
## Introspection functions
There are several functions that can help to inspect the content of the JSON column: [JSONAllPaths](../functions/json-functions.md#jsonallpaths), [JSONAllPathsWithTypes](../functions/json-functions.md#jsonallpathswithtypes), [JSONDynamicPaths](../functions/json-functions.md#jsondynamicpaths), [JSONDynamicPathsWithTypes](../functions/json-functions.md#jsondynamicpathswithtypes), [JSONSharedDataPaths](../functions/json-functions.md#jsonshareddatapaths), [JSONSharedDataPathsWithTypes](../functions/json-functions.md#jsonshareddatapathswithtypes).
## Tips for better usage of the JSON type
Before creating `JSON` column and loading data into it, consider the following tips:
- Investigate your data and specify as many path hints with types as you can. It will make the storage and the reading much more efficient.
- Think about what paths you will need and what paths you will never need. Specify paths that you won't need in the SKIP section and SKIP REGEXP if needed. It will improve the storage.
- Don't set `max_dynamic_paths` parameter to very high values, it can make the storage and reading less efficient.

View File

@ -1155,3 +1155,207 @@ SELECT jsonMergePatch('{"a":1}', '{"name": "joey"}', '{"name": "tom"}', '{"name"
│ {"a":1,"name":"zoey"} │ │ {"a":1,"name":"zoey"} │
└───────────────────────┘ └───────────────────────┘
``` ```
### JSONAllPaths
Returns the list of all paths stored in each row in [JSON](../data-types/newjson.md) column.
**Syntax**
``` sql
JSONAllPaths(json)
```
**Arguments**
- `json` — [JSON](../data-types/newjson.md).
**Returned value**
- An array of paths. [Array(String)](../data-types/array.md).
**Example**
``` sql
CREATE TABLE test (json JSON(max_dynamic_paths=1)) ENGINE = Memory;
INSERT INTO test FORMAT JSONEachRow {"json" : {"a" : 42}}, {"json" : {"b" : "Hello"}}, {"json" : {"a" : [1, 2, 3], "c" : "2020-01-01"}}
SELECT json, JSONAllPaths(json) FROM test;
```
```text
┌─json─────────────────────────────────┬─JSONAllPaths(json)─┐
│ {"a":"42"} │ ['a'] │
│ {"b":"Hello"} │ ['b'] │
│ {"a":["1","2","3"],"c":"2020-01-01"} │ ['a','c'] │
└──────────────────────────────────────┴────────────────────┘
```
### JSONAllPathsWithTypes
Returns the map of all paths and their data types stored in each row in [JSON](../data-types/newjson.md) column.
**Syntax**
``` sql
JSONAllPathsWithTypes(json)
```
**Arguments**
- `json` — [JSON](../data-types/newjson.md).
**Returned value**
- A map of paths and their data types. [Map(String, String)](../data-types/map.md).
**Example**
``` sql
CREATE TABLE test (json JSON(max_dynamic_paths=1)) ENGINE = Memory;
INSERT INTO test FORMAT JSONEachRow {"json" : {"a" : 42}}, {"json" : {"b" : "Hello"}}, {"json" : {"a" : [1, 2, 3], "c" : "2020-01-01"}}
SELECT json, JSONAllPathsWithTypes(json) FROM test;
```
```text
┌─json─────────────────────────────────┬─JSONAllPathsWithTypes(json)───────────────┐
│ {"a":"42"} │ {'a':'Int64'} │
│ {"b":"Hello"} │ {'b':'String'} │
│ {"a":["1","2","3"],"c":"2020-01-01"} │ {'a':'Array(Nullable(Int64))','c':'Date'} │
└──────────────────────────────────────┴───────────────────────────────────────────┘
```
### JSONDynamicPaths
Returns the list of dynamic paths that are stored as separate subcolumns in [JSON](../data-types/newjson.md) column.
**Syntax**
``` sql
JSONDynamicPaths(json)
```
**Arguments**
- `json` — [JSON](../data-types/newjson.md).
**Returned value**
- An array of paths. [Array(String)](../data-types/array.md).
**Example**
``` sql
CREATE TABLE test (json JSON(max_dynamic_paths=1)) ENGINE = Memory;
INSERT INTO test FORMAT JSONEachRow {"json" : {"a" : 42}}, {"json" : {"b" : "Hello"}}, {"json" : {"a" : [1, 2, 3], "c" : "2020-01-01"}}
SELECT json, JSONDynamicPaths(json) FROM test;
```
```text
┌─json─────────────────────────────────┬─JSONDynamicPaths(json)─┐
│ {"a":"42"} │ ['a'] │
│ {"b":"Hello"} │ [] │
│ {"a":["1","2","3"],"c":"2020-01-01"} │ ['a'] │
└──────────────────────────────────────┴────────────────────────┘
```
### JSONDynamicPathsWithTypes
Returns the map of dynamic paths that are stored as separate subcolumns and their types in each row in [JSON](../data-types/newjson.md) column.
**Syntax**
``` sql
JSONDynamicPathsWithTypes(json)
```
**Arguments**
- `json` — [JSON](../data-types/newjson.md).
**Returned value**
- A map of paths and their data types. [Map(String, String)](../data-types/map.md).
**Example**
``` sql
CREATE TABLE test (json JSON(max_dynamic_paths=1)) ENGINE = Memory;
INSERT INTO test FORMAT JSONEachRow {"json" : {"a" : 42}}, {"json" : {"b" : "Hello"}}, {"json" : {"a" : [1, 2, 3], "c" : "2020-01-01"}}
SELECT json, JSONDynamicPathsWithTypes(json) FROM test;
```
```text
┌─json─────────────────────────────────┬─JSONDynamicPathsWithTypes(json)─┐
│ {"a":"42"} │ {'a':'Int64'} │
│ {"b":"Hello"} │ {} │
│ {"a":["1","2","3"],"c":"2020-01-01"} │ {'a':'Array(Nullable(Int64))'} │
└──────────────────────────────────────┴─────────────────────────────────┘
```
### JSONSharedDataPaths
Returns the list of paths that are stored in shared data structure in [JSON](../data-types/newjson.md) column.
**Syntax**
``` sql
JSONSharedDataPaths(json)
```
**Arguments**
- `json` — [JSON](../data-types/newjson.md).
**Returned value**
- An array of paths. [Array(String)](../data-types/array.md).
**Example**
``` sql
CREATE TABLE test (json JSON(max_dynamic_paths=1)) ENGINE = Memory;
INSERT INTO test FORMAT JSONEachRow {"json" : {"a" : 42}}, {"json" : {"b" : "Hello"}}, {"json" : {"a" : [1, 2, 3], "c" : "2020-01-01"}}
SELECT json, JSONSharedDataPaths(json) FROM test;
```
```text
┌─json─────────────────────────────────┬─JSONSharedDataPaths(json)─┐
│ {"a":"42"} │ [] │
│ {"b":"Hello"} │ ['b'] │
│ {"a":["1","2","3"],"c":"2020-01-01"} │ ['c'] │
└──────────────────────────────────────┴───────────────────────────┘
```
### JSONSharedDataPathsWithTypes
Returns the map of paths that are stored in shared data structure and their types in each row in [JSON](../data-types/newjson.md) column.
**Syntax**
``` sql
JSONSharedDataPathsWithTypes(json)
```
**Arguments**
- `json` — [JSON](../data-types/newjson.md).
**Returned value**
- A map of paths and their data types. [Map(String, String)](../data-types/map.md).
**Example**
``` sql
CREATE TABLE test (json JSON(max_dynamic_paths=1)) ENGINE = Memory;
INSERT INTO test FORMAT JSONEachRow {"json" : {"a" : 42}}, {"json" : {"b" : "Hello"}}, {"json" : {"a" : [1, 2, 3], "c" : "2020-01-01"}}
SELECT json, JSONSharedDataPathsWithTypes(json) FROM test;
```
```text
┌─json─────────────────────────────────┬─JSONSharedDataPathsWithTypes(json)─┐
│ {"a":"42"} │ {} │
│ {"b":"Hello"} │ {'b':'String'} │
│ {"a":["1","2","3"],"c":"2020-01-01"} │ {'c':'Date'} │
└──────────────────────────────────────┴────────────────────────────────────┘
```

View File

@ -175,6 +175,11 @@ int mainEntryClickHouseFormat(int argc, char ** argv)
hash_func.update(options["seed"].as<std::string>()); hash_func.update(options["seed"].as<std::string>());
} }
SharedContextHolder shared_context = Context::createShared();
auto context = Context::createGlobal(shared_context.get());
auto context_const = WithContext(context).getContext();
context->makeGlobalContext();
registerInterpreters(); registerInterpreters();
registerFunctions(); registerFunctions();
registerAggregateFunctions(); registerAggregateFunctions();

View File

@ -1,5 +1,5 @@
#include <DataTypes/DataTypeString.h> #include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeObject.h> #include <DataTypes/DataTypeObjectDeprecated.h>
#include <DataTypes/DataTypeArray.h> #include <DataTypes/DataTypeArray.h>
#include <DataTypes/NestedUtils.h> #include <DataTypes/NestedUtils.h>
@ -452,10 +452,10 @@ QueryTreeNodePtr IdentifierResolver::tryResolveIdentifierFromCompoundExpression(
if (auto * column = compound_expression->as<ColumnNode>()) if (auto * column = compound_expression->as<ColumnNode>())
{ {
const DataTypePtr & column_type = column->getColumn().getTypeInStorage(); const DataTypePtr & column_type = column->getColumn().getTypeInStorage();
if (column_type->getTypeId() == TypeIndex::Object) if (column_type->getTypeId() == TypeIndex::ObjectDeprecated)
{ {
const auto * object_type = checkAndGetDataType<DataTypeObject>(column_type.get()); const auto & object_type = checkAndGetDataType<DataTypeObjectDeprecated>(*column_type);
if (object_type->getSchemaFormat() == "json" && object_type->hasNullableSubcolumns()) if (object_type.getSchemaFormat() == "json" && object_type.hasNullableSubcolumns())
{ {
QueryTreeNodePtr constant_node_null = std::make_shared<ConstantNode>(Field()); QueryTreeNodePtr constant_node_null = std::make_shared<ConstantNode>(Field());
return constant_node_null; return constant_node_null;
@ -1000,7 +1000,6 @@ QueryTreeNodePtr IdentifierResolver::tryResolveIdentifierFromJoin(const Identifi
if (!join_node_in_resolve_process && from_join_node.isUsingJoinExpression()) if (!join_node_in_resolve_process && from_join_node.isUsingJoinExpression())
{ {
auto & join_using_list = from_join_node.getJoinExpression()->as<ListNode &>(); auto & join_using_list = from_join_node.getJoinExpression()->as<ListNode &>();
for (auto & join_using_node : join_using_list.getNodes()) for (auto & join_using_node : join_using_list.getNodes())
{ {
auto & column_node = join_using_node->as<ColumnNode &>(); auto & column_node = join_using_node->as<ColumnNode &>();

View File

@ -3,7 +3,7 @@
#include <DataTypes/DataTypesNumber.h> #include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeString.h> #include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeNullable.h> #include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeObject.h> #include <DataTypes/DataTypeObjectDeprecated.h>
#include <DataTypes/DataTypeTuple.h> #include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypeArray.h> #include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeMap.h> #include <DataTypes/DataTypeMap.h>

View File

@ -452,6 +452,11 @@ void ColumnArray::reserve(size_t n)
getData().reserve(n); /// The average size of arrays is not taken into account here. Or it is considered to be no more than 1. getData().reserve(n); /// The average size of arrays is not taken into account here. Or it is considered to be no more than 1.
} }
size_t ColumnArray::capacity() const
{
return getOffsets().capacity();
}
void ColumnArray::prepareForSquashing(const Columns & source_columns) void ColumnArray::prepareForSquashing(const Columns & source_columns)
{ {
size_t new_size = size(); size_t new_size = size();

View File

@ -118,6 +118,7 @@ public:
void updatePermutationWithCollation(const Collator & collator, PermutationSortDirection direction, PermutationSortStability stability, void updatePermutationWithCollation(const Collator & collator, PermutationSortDirection direction, PermutationSortStability stability,
size_t limit, int nan_direction_hint, Permutation & res, EqualRanges& equal_ranges) const override; size_t limit, int nan_direction_hint, Permutation & res, EqualRanges& equal_ranges) const override;
void reserve(size_t n) override; void reserve(size_t n) override;
size_t capacity() const override;
void prepareForSquashing(const Columns & source_columns) override; void prepareForSquashing(const Columns & source_columns) override;
void shrinkToFit() override; void shrinkToFit() override;
void ensureOwnership() override; void ensureOwnership() override;

View File

@ -53,6 +53,7 @@ public:
size_t allocatedBytes() const override { return data.allocated_bytes(); } size_t allocatedBytes() const override { return data.allocated_bytes(); }
void protect() override { data.protect(); } void protect() override { data.protect(); }
void reserve(size_t n) override { data.reserve_exact(n); } void reserve(size_t n) override { data.reserve_exact(n); }
/// Reports the capacity of the underlying `data` container.
size_t capacity() const override
{
    return data.capacity();
}
void shrinkToFit() override { data.shrink_to_fit(); } void shrinkToFit() override { data.shrink_to_fit(); }
#if !defined(DEBUG_OR_SANITIZER_BUILD) #if !defined(DEBUG_OR_SANITIZER_BUILD)

View File

@ -16,7 +16,6 @@
#include <IO/ReadBufferFromMemory.h> #include <IO/ReadBufferFromMemory.h>
#include <IO/WriteBufferFromString.h> #include <IO/WriteBufferFromString.h>
#include <Formats/FormatSettings.h> #include <Formats/FormatSettings.h>
#include <Common/logger_useful.h>
namespace DB namespace DB
{ {
@ -56,6 +55,7 @@ ColumnDynamic::ColumnDynamic(size_t max_dynamic_types_) : max_dynamic_types(max_
ColumnDynamic::ColumnDynamic( ColumnDynamic::ColumnDynamic(
MutableColumnPtr variant_column_, const DataTypePtr & variant_type_, size_t max_dynamic_types_, size_t global_max_dynamic_types_, const StatisticsPtr & statistics_) MutableColumnPtr variant_column_, const DataTypePtr & variant_type_, size_t max_dynamic_types_, size_t global_max_dynamic_types_, const StatisticsPtr & statistics_)
: variant_column(std::move(variant_column_)) : variant_column(std::move(variant_column_))
, variant_column_ptr(assert_cast<ColumnVariant *>(variant_column.get()))
, max_dynamic_types(max_dynamic_types_) , max_dynamic_types(max_dynamic_types_)
, global_max_dynamic_types(global_max_dynamic_types_) , global_max_dynamic_types(global_max_dynamic_types_)
, statistics(statistics_) , statistics(statistics_)
@ -66,6 +66,7 @@ ColumnDynamic::ColumnDynamic(
ColumnDynamic::ColumnDynamic( ColumnDynamic::ColumnDynamic(
MutableColumnPtr variant_column_, const VariantInfo & variant_info_, size_t max_dynamic_types_, size_t global_max_dynamic_types_, const StatisticsPtr & statistics_) MutableColumnPtr variant_column_, const VariantInfo & variant_info_, size_t max_dynamic_types_, size_t global_max_dynamic_types_, const StatisticsPtr & statistics_)
: variant_column(std::move(variant_column_)) : variant_column(std::move(variant_column_))
, variant_column_ptr(assert_cast<ColumnVariant *>(variant_column.get()))
, variant_info(variant_info_) , variant_info(variant_info_)
, max_dynamic_types(max_dynamic_types_) , max_dynamic_types(max_dynamic_types_)
, global_max_dynamic_types(global_max_dynamic_types_) , global_max_dynamic_types(global_max_dynamic_types_)
@ -79,6 +80,7 @@ void ColumnDynamic::setVariantType(const DataTypePtr & variant_type)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Setting specific variant type is allowed only for empty dynamic column"); throw Exception(ErrorCodes::LOGICAL_ERROR, "Setting specific variant type is allowed only for empty dynamic column");
variant_column = variant_type->createColumn(); variant_column = variant_type->createColumn();
variant_column_ptr = assert_cast<ColumnVariant *>(variant_column.get());
createVariantInfo(variant_type); createVariantInfo(variant_type);
} }
@ -313,12 +315,12 @@ void ColumnDynamic::doInsertFrom(const IColumn & src_, size_t n)
/// Check if we have the same variants in both columns. /// Check if we have the same variants in both columns.
if (variant_info.variant_name == dynamic_src.variant_info.variant_name) if (variant_info.variant_name == dynamic_src.variant_info.variant_name)
{ {
variant_column->insertFrom(*dynamic_src.variant_column, n); variant_column_ptr->insertFrom(*dynamic_src.variant_column, n);
return; return;
} }
auto & variant_col = assert_cast<ColumnVariant &>(*variant_column); auto & variant_col = getVariantColumn();
const auto & src_variant_col = assert_cast<const ColumnVariant &>(*dynamic_src.variant_column); const auto & src_variant_col = dynamic_src.getVariantColumn();
auto src_global_discr = src_variant_col.globalDiscriminatorAt(n); auto src_global_discr = src_variant_col.globalDiscriminatorAt(n);
auto src_offset = src_variant_col.offsetAt(n); auto src_offset = src_variant_col.offsetAt(n);
@ -386,16 +388,15 @@ void ColumnDynamic::doInsertRangeFrom(const IColumn & src_, size_t start, size_t
"[start({}) + length({}) > src.size()({})]", start, length, src_.size()); "[start({}) + length({}) > src.size()({})]", start, length, src_.size());
const auto & dynamic_src = assert_cast<const ColumnDynamic &>(src_); const auto & dynamic_src = assert_cast<const ColumnDynamic &>(src_);
auto & variant_col = getVariantColumn();
/// Check if we have the same variants in both columns. /// Check if we have the same variants in both columns.
if (variant_info.variant_names == dynamic_src.variant_info.variant_names) if (variant_info.variant_names == dynamic_src.variant_info.variant_names)
{ {
variant_column->insertRangeFrom(*dynamic_src.variant_column, start, length); variant_col.insertRangeFrom(*dynamic_src.variant_column, start, length);
return; return;
} }
auto & variant_col = assert_cast<ColumnVariant &>(*variant_column);
/// If variants are different, we need to extend our variant with new variants. /// If variants are different, we need to extend our variant with new variants.
if (auto * global_discriminators_mapping = combineVariants(dynamic_src.variant_info)) if (auto * global_discriminators_mapping = combineVariants(dynamic_src.variant_info))
{ {
@ -602,15 +603,15 @@ void ColumnDynamic::doInsertManyFrom(const IColumn & src_, size_t position, size
#endif #endif
{ {
const auto & dynamic_src = assert_cast<const ColumnDynamic &>(src_); const auto & dynamic_src = assert_cast<const ColumnDynamic &>(src_);
auto & variant_col = getVariantColumn();
/// Check if we have the same variants in both columns. /// Check if we have the same variants in both columns.
if (variant_info.variant_names == dynamic_src.variant_info.variant_names) if (variant_info.variant_names == dynamic_src.variant_info.variant_names)
{ {
variant_column->insertManyFrom(*dynamic_src.variant_column, position, length); variant_col.insertManyFrom(*dynamic_src.variant_column, position, length);
return; return;
} }
auto & variant_col = assert_cast<ColumnVariant &>(*variant_column);
const auto & src_variant_col = assert_cast<const ColumnVariant &>(*dynamic_src.variant_column); const auto & src_variant_col = assert_cast<const ColumnVariant &>(*dynamic_src.variant_column);
auto src_global_discr = src_variant_col.globalDiscriminatorAt(position); auto src_global_discr = src_variant_col.globalDiscriminatorAt(position);
auto src_offset = src_variant_col.offsetAt(position); auto src_offset = src_variant_col.offsetAt(position);
@ -751,7 +752,7 @@ StringRef ColumnDynamic::serializeValueIntoArena(size_t n, Arena & arena, const
const char * ColumnDynamic::deserializeAndInsertFromArena(const char * pos) const char * ColumnDynamic::deserializeAndInsertFromArena(const char * pos)
{ {
auto & variant_col = assert_cast<ColumnVariant &>(*variant_column); auto & variant_col = getVariantColumn();
UInt8 null_bit = unalignedLoad<UInt8>(pos); UInt8 null_bit = unalignedLoad<UInt8>(pos);
pos += sizeof(UInt8); pos += sizeof(UInt8);
if (null_bit) if (null_bit)
@ -808,7 +809,7 @@ const char * ColumnDynamic::skipSerializedInArena(const char * pos) const
void ColumnDynamic::updateHashWithValue(size_t n, SipHash & hash) const void ColumnDynamic::updateHashWithValue(size_t n, SipHash & hash) const
{ {
const auto & variant_col = assert_cast<const ColumnVariant &>(*variant_column); const auto & variant_col = getVariantColumn();
auto discr = variant_col.globalDiscriminatorAt(n); auto discr = variant_col.globalDiscriminatorAt(n);
if (discr == ColumnVariant::NULL_DISCRIMINATOR) if (discr == ColumnVariant::NULL_DISCRIMINATOR)
{ {
@ -826,9 +827,9 @@ int ColumnDynamic::compareAt(size_t n, size_t m, const IColumn & rhs, int nan_di
int ColumnDynamic::doCompareAt(size_t n, size_t m, const IColumn & rhs, int nan_direction_hint) const int ColumnDynamic::doCompareAt(size_t n, size_t m, const IColumn & rhs, int nan_direction_hint) const
#endif #endif
{ {
const auto & left_variant = assert_cast<const ColumnVariant &>(*variant_column); const auto & left_variant = getVariantColumn();
const auto & right_dynamic = assert_cast<const ColumnDynamic &>(rhs); const auto & right_dynamic = assert_cast<const ColumnDynamic &>(rhs);
const auto & right_variant = assert_cast<const ColumnVariant &>(*right_dynamic.variant_column); const auto & right_variant = right_dynamic.getVariantColumn();
auto left_discr = left_variant.globalDiscriminatorAt(n); auto left_discr = left_variant.globalDiscriminatorAt(n);
auto left_shared_variant_discr = getSharedVariantDiscriminator(); auto left_shared_variant_discr = getSharedVariantDiscriminator();
@ -970,7 +971,7 @@ void ColumnDynamic::updatePermutation(IColumn::PermutationSortDirection directio
ColumnPtr ColumnDynamic::compress() const ColumnPtr ColumnDynamic::compress() const
{ {
ColumnPtr variant_compressed = variant_column->compress(); ColumnPtr variant_compressed = variant_column_ptr->compress();
size_t byte_size = variant_compressed->byteSize(); size_t byte_size = variant_compressed->byteSize();
return ColumnCompressed::create(size(), byte_size, return ColumnCompressed::create(size(), byte_size,
[my_variant_compressed = std::move(variant_compressed), my_variant_info = variant_info, my_max_dynamic_types = max_dynamic_types, my_global_max_dynamic_types = global_max_dynamic_types, my_statistics = statistics]() mutable [my_variant_compressed = std::move(variant_compressed), my_variant_info = variant_info, my_max_dynamic_types = max_dynamic_types, my_global_max_dynamic_types = global_max_dynamic_types, my_statistics = statistics]() mutable
@ -998,7 +999,18 @@ void ColumnDynamic::prepareForSquashing(const Columns & source_columns)
variant_col.getLocalDiscriminators().reserve_exact(new_size); variant_col.getLocalDiscriminators().reserve_exact(new_size);
variant_col.getOffsets().reserve_exact(new_size); variant_col.getOffsets().reserve_exact(new_size);
/// Second, collect all variants and their total sizes. /// Second, preallocate memory for variants.
prepareVariantsForSquashing(source_columns);
}
void ColumnDynamic::prepareVariantsForSquashing(const Columns & source_columns)
{
/// Internal variants of source dynamic columns may differ.
/// We want to preallocate memory for all variants we will have after squashing.
/// It may happen that the total number of variants in source columns will
/// exceed the limit, in this case we will choose the most frequent variants.
/// Collect all variants and their total sizes.
std::unordered_map<String, size_t> total_variant_sizes; std::unordered_map<String, size_t> total_variant_sizes;
DataTypes all_variants; DataTypes all_variants;
@ -1072,6 +1084,7 @@ void ColumnDynamic::prepareForSquashing(const Columns & source_columns)
/// Now current dynamic column has all resulting variants and we can call /// Now current dynamic column has all resulting variants and we can call
/// prepareForSquashing on them to preallocate the memory. /// prepareForSquashing on them to preallocate the memory.
auto & variant_col = getVariantColumn();
for (size_t i = 0; i != variant_info.variant_names.size(); ++i) for (size_t i = 0; i != variant_info.variant_names.size(); ++i)
{ {
Columns source_variant_columns; Columns source_variant_columns;
@ -1240,12 +1253,12 @@ void ColumnDynamic::takeDynamicStructureFromSourceColumns(const Columns & source
void ColumnDynamic::applyNullMap(const ColumnVector<UInt8>::Container & null_map) void ColumnDynamic::applyNullMap(const ColumnVector<UInt8>::Container & null_map)
{ {
assert_cast<ColumnVariant &>(*variant_column).applyNullMap(null_map); variant_column_ptr->applyNullMap(null_map);
} }
void ColumnDynamic::applyNegatedNullMap(const ColumnVector<UInt8>::Container & null_map) void ColumnDynamic::applyNegatedNullMap(const ColumnVector<UInt8>::Container & null_map)
{ {
assert_cast<ColumnVariant &>(*variant_column).applyNegatedNullMap(null_map); variant_column_ptr->applyNegatedNullMap(null_map);
} }
} }

View File

@ -106,7 +106,7 @@ public:
return create(variant_column_->assumeMutable(), variant_type, max_dynamic_types_, global_max_dynamic_types_, statistics_); return create(variant_column_->assumeMutable(), variant_type, max_dynamic_types_, global_max_dynamic_types_, statistics_);
} }
static MutablePtr create(size_t max_dynamic_types_) static MutablePtr create(size_t max_dynamic_types_ = MAX_DYNAMIC_TYPES_LIMIT)
{ {
return Base::create(max_dynamic_types_); return Base::create(max_dynamic_types_);
} }
@ -136,7 +136,7 @@ public:
size_t size() const override size_t size() const override
{ {
return variant_column->size(); return variant_column_ptr->size();
} }
Field operator[](size_t n) const override; Field operator[](size_t n) const override;
@ -145,22 +145,22 @@ public:
bool isDefaultAt(size_t n) const override bool isDefaultAt(size_t n) const override
{ {
return variant_column->isDefaultAt(n); return variant_column_ptr->isDefaultAt(n);
} }
bool isNullAt(size_t n) const override bool isNullAt(size_t n) const override
{ {
return variant_column->isNullAt(n); return variant_column_ptr->isNullAt(n);
} }
StringRef getDataAt(size_t n) const override StringRef getDataAt(size_t n) const override
{ {
return variant_column->getDataAt(n); return variant_column_ptr->getDataAt(n);
} }
void insertData(const char * pos, size_t length) override void insertData(const char * pos, size_t length) override
{ {
variant_column->insertData(pos, length); variant_column_ptr->insertData(pos, length);
} }
void insert(const Field & x) override; void insert(const Field & x) override;
@ -178,17 +178,17 @@ public:
void insertDefault() override void insertDefault() override
{ {
variant_column->insertDefault(); variant_column_ptr->insertDefault();
} }
void insertManyDefaults(size_t length) override void insertManyDefaults(size_t length) override
{ {
variant_column->insertManyDefaults(length); variant_column_ptr->insertManyDefaults(length);
} }
void popBack(size_t n) override void popBack(size_t n) override
{ {
variant_column->popBack(n); variant_column_ptr->popBack(n);
} }
StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const override; StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const override;
@ -199,42 +199,42 @@ public:
WeakHash32 getWeakHash32() const override WeakHash32 getWeakHash32() const override
{ {
return variant_column->getWeakHash32(); return variant_column_ptr->getWeakHash32();
} }
void updateHashFast(SipHash & hash) const override void updateHashFast(SipHash & hash) const override
{ {
variant_column->updateHashFast(hash); variant_column_ptr->updateHashFast(hash);
} }
ColumnPtr filter(const Filter & filt, ssize_t result_size_hint) const override ColumnPtr filter(const Filter & filt, ssize_t result_size_hint) const override
{ {
return create(variant_column->filter(filt, result_size_hint), variant_info, max_dynamic_types, global_max_dynamic_types); return create(variant_column_ptr->filter(filt, result_size_hint), variant_info, max_dynamic_types, global_max_dynamic_types);
} }
void expand(const Filter & mask, bool inverted) override void expand(const Filter & mask, bool inverted) override
{ {
variant_column->expand(mask, inverted); variant_column_ptr->expand(mask, inverted);
} }
ColumnPtr permute(const Permutation & perm, size_t limit) const override ColumnPtr permute(const Permutation & perm, size_t limit) const override
{ {
return create(variant_column->permute(perm, limit), variant_info, max_dynamic_types, global_max_dynamic_types); return create(variant_column_ptr->permute(perm, limit), variant_info, max_dynamic_types, global_max_dynamic_types);
} }
ColumnPtr index(const IColumn & indexes, size_t limit) const override ColumnPtr index(const IColumn & indexes, size_t limit) const override
{ {
return create(variant_column->index(indexes, limit), variant_info, max_dynamic_types, global_max_dynamic_types); return create(variant_column_ptr->index(indexes, limit), variant_info, max_dynamic_types, global_max_dynamic_types);
} }
ColumnPtr replicate(const Offsets & replicate_offsets) const override ColumnPtr replicate(const Offsets & replicate_offsets) const override
{ {
return create(variant_column->replicate(replicate_offsets), variant_info, max_dynamic_types, global_max_dynamic_types); return create(variant_column_ptr->replicate(replicate_offsets), variant_info, max_dynamic_types, global_max_dynamic_types);
} }
MutableColumns scatter(ColumnIndex num_columns, const Selector & selector) const override MutableColumns scatter(ColumnIndex num_columns, const Selector & selector) const override
{ {
MutableColumns scattered_variant_columns = variant_column->scatter(num_columns, selector); MutableColumns scattered_variant_columns = variant_column_ptr->scatter(num_columns, selector);
MutableColumns scattered_columns; MutableColumns scattered_columns;
scattered_columns.reserve(num_columns); scattered_columns.reserve(num_columns);
for (auto & scattered_variant_column : scattered_variant_columns) for (auto & scattered_variant_column : scattered_variant_columns)
@ -251,12 +251,12 @@ public:
bool hasEqualValues() const override bool hasEqualValues() const override
{ {
return variant_column->hasEqualValues(); return variant_column_ptr->hasEqualValues();
} }
void getExtremes(Field & min, Field & max) const override void getExtremes(Field & min, Field & max) const override
{ {
variant_column->getExtremes(min, max); variant_column_ptr->getExtremes(min, max);
} }
void getPermutation(IColumn::PermutationSortDirection direction, IColumn::PermutationSortStability stability, void getPermutation(IColumn::PermutationSortDirection direction, IColumn::PermutationSortStability stability,
@ -267,44 +267,53 @@ public:
void reserve(size_t n) override void reserve(size_t n) override
{ {
variant_column->reserve(n); variant_column_ptr->reserve(n);
}
size_t capacity() const override
{
return variant_column_ptr->capacity();
} }
void prepareForSquashing(const Columns & source_columns) override; void prepareForSquashing(const Columns & source_columns) override;
/// Prepare only variants but not discriminators and offsets.
void prepareVariantsForSquashing(const Columns & source_columns);
void ensureOwnership() override void ensureOwnership() override
{ {
variant_column->ensureOwnership(); variant_column_ptr->ensureOwnership();
} }
size_t byteSize() const override size_t byteSize() const override
{ {
return variant_column->byteSize(); return variant_column_ptr->byteSize();
} }
size_t byteSizeAt(size_t n) const override size_t byteSizeAt(size_t n) const override
{ {
return variant_column->byteSizeAt(n); return variant_column_ptr->byteSizeAt(n);
} }
size_t allocatedBytes() const override size_t allocatedBytes() const override
{ {
return variant_column->allocatedBytes(); return variant_column_ptr->allocatedBytes();
} }
void protect() override void protect() override
{ {
variant_column->protect(); variant_column_ptr->protect();
} }
void forEachSubcolumn(MutableColumnCallback callback) override void forEachSubcolumn(MutableColumnCallback callback) override
{ {
callback(variant_column); callback(variant_column);
variant_column_ptr = assert_cast<ColumnVariant *>(variant_column.get());
} }
void forEachSubcolumnRecursively(RecursiveMutableColumnCallback callback) override void forEachSubcolumnRecursively(RecursiveMutableColumnCallback callback) override
{ {
callback(*variant_column); callback(*variant_column);
variant_column_ptr = assert_cast<ColumnVariant *>(variant_column.get());
variant_column->forEachSubcolumnRecursively(callback); variant_column->forEachSubcolumnRecursively(callback);
} }
@ -319,27 +328,27 @@ public:
double getRatioOfDefaultRows(double sample_ratio) const override double getRatioOfDefaultRows(double sample_ratio) const override
{ {
return variant_column->getRatioOfDefaultRows(sample_ratio); return variant_column_ptr->getRatioOfDefaultRows(sample_ratio);
} }
UInt64 getNumberOfDefaultRows() const override UInt64 getNumberOfDefaultRows() const override
{ {
return variant_column->getNumberOfDefaultRows(); return variant_column_ptr->getNumberOfDefaultRows();
} }
void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override
{ {
variant_column->getIndicesOfNonDefaultRows(indices, from, limit); variant_column_ptr->getIndicesOfNonDefaultRows(indices, from, limit);
} }
void finalize() override void finalize() override
{ {
variant_column->finalize(); variant_column_ptr->finalize();
} }
bool isFinalized() const override bool isFinalized() const override
{ {
return variant_column->isFinalized(); return variant_column_ptr->isFinalized();
} }
/// Apply null map to a nested Variant column. /// Apply null map to a nested Variant column.
@ -351,8 +360,8 @@ public:
const ColumnPtr & getVariantColumnPtr() const { return variant_column; } const ColumnPtr & getVariantColumnPtr() const { return variant_column; }
ColumnPtr & getVariantColumnPtr() { return variant_column; } ColumnPtr & getVariantColumnPtr() { return variant_column; }
const ColumnVariant & getVariantColumn() const { return assert_cast<const ColumnVariant &>(*variant_column); } const ColumnVariant & getVariantColumn() const { return *variant_column_ptr; }
ColumnVariant & getVariantColumn() { return assert_cast<ColumnVariant &>(*variant_column); } ColumnVariant & getVariantColumn() { return *variant_column_ptr; }
bool addNewVariant(const DataTypePtr & new_variant, const String & new_variant_name); bool addNewVariant(const DataTypePtr & new_variant, const String & new_variant_name);
bool addNewVariant(const DataTypePtr & new_variant) { return addNewVariant(new_variant, new_variant->getName()); } bool addNewVariant(const DataTypePtr & new_variant) { return addNewVariant(new_variant, new_variant->getName()); }
@ -420,6 +429,7 @@ public:
} }
const SerializationPtr & getVariantSerialization(const DataTypePtr & variant_type) const { return getVariantSerialization(variant_type, variant_type->getName()); } const SerializationPtr & getVariantSerialization(const DataTypePtr & variant_type) const { return getVariantSerialization(variant_type, variant_type->getName()); }
private: private:
void createVariantInfo(const DataTypePtr & variant_type); void createVariantInfo(const DataTypePtr & variant_type);
@ -432,6 +442,10 @@ private:
void updateVariantInfoAndExpandVariantColumn(const DataTypePtr & new_variant_type); void updateVariantInfoAndExpandVariantColumn(const DataTypePtr & new_variant_type);
WrappedPtr variant_column; WrappedPtr variant_column;
/// Store and use pointer to ColumnVariant to avoid virtual calls.
/// ColumnDynamic is widely used inside ColumnObject for each path and
/// with hundreds of paths these virtual calls are noticeable.
ColumnVariant * variant_column_ptr;
/// Store the type of current variant with some additional information. /// Store the type of current variant with some additional information.
VariantInfo variant_info; VariantInfo variant_info;
/// The maximum number of different types that can be stored in this Dynamic column. /// The maximum number of different types that can be stored in this Dynamic column.

View File

@ -182,6 +182,11 @@ public:
chars.reserve_exact(n * size); chars.reserve_exact(n * size);
} }
size_t capacity() const override
{
return chars.capacity() / n;
}
void shrinkToFit() override void shrinkToFit() override
{ {
chars.shrink_to_fit(); chars.shrink_to_fit();

View File

@ -172,6 +172,7 @@ public:
} }
void reserve(size_t n) override { idx.reserve(n); } void reserve(size_t n) override { idx.reserve(n); }
size_t capacity() const override { return idx.capacity(); }
void shrinkToFit() override { idx.shrinkToFit(); } void shrinkToFit() override { idx.shrinkToFit(); }
/// Don't count the dictionary size as it can be shared between different blocks. /// Don't count the dictionary size as it can be shared between different blocks.
@ -309,6 +310,7 @@ public:
void popBack(size_t n) { positions->popBack(n); } void popBack(size_t n) { positions->popBack(n); }
void reserve(size_t n) { positions->reserve(n); } void reserve(size_t n) { positions->reserve(n); }
size_t capacity() const { return positions->capacity(); }
void shrinkToFit() { positions->shrinkToFit(); } void shrinkToFit() { positions->shrinkToFit(); }
UInt64 getMaxPositionForCurrentType() const; UInt64 getMaxPositionForCurrentType() const;

View File

@ -249,6 +249,11 @@ void ColumnMap::reserve(size_t n)
nested->reserve(n); nested->reserve(n);
} }
size_t ColumnMap::capacity() const
{
return nested->capacity();
}
void ColumnMap::prepareForSquashing(const Columns & source_columns) void ColumnMap::prepareForSquashing(const Columns & source_columns)
{ {
Columns nested_source_columns; Columns nested_source_columns;

View File

@ -94,6 +94,7 @@ public:
void updatePermutation(IColumn::PermutationSortDirection direction, IColumn::PermutationSortStability stability, void updatePermutation(IColumn::PermutationSortDirection direction, IColumn::PermutationSortStability stability,
size_t limit, int nan_direction_hint, IColumn::Permutation & res, EqualRanges & equal_ranges) const override; size_t limit, int nan_direction_hint, IColumn::Permutation & res, EqualRanges & equal_ranges) const override;
void reserve(size_t n) override; void reserve(size_t n) override;
size_t capacity() const override;
void prepareForSquashing(const Columns & source_columns) override; void prepareForSquashing(const Columns & source_columns) override;
void shrinkToFit() override; void shrinkToFit() override;
void ensureOwnership() override; void ensureOwnership() override;

View File

@ -706,6 +706,11 @@ void ColumnNullable::reserve(size_t n)
getNullMapData().reserve(n); getNullMapData().reserve(n);
} }
size_t ColumnNullable::capacity() const
{
return getNullMapData().capacity();
}
void ColumnNullable::prepareForSquashing(const Columns & source_columns) void ColumnNullable::prepareForSquashing(const Columns & source_columns)
{ {
size_t new_size = size(); size_t new_size = size();

View File

@ -125,6 +125,7 @@ public:
size_t limit, int null_direction_hint, Permutation & res, EqualRanges& equal_ranges) const override; size_t limit, int null_direction_hint, Permutation & res, EqualRanges& equal_ranges) const override;
size_t estimateCardinalityInPermutedRange(const Permutation & permutation, const EqualRange & equal_range) const override; size_t estimateCardinalityInPermutedRange(const Permutation & permutation, const EqualRange & equal_range) const override;
void reserve(size_t n) override; void reserve(size_t n) override;
size_t capacity() const override;
void prepareForSquashing(const Columns & source_columns) override; void prepareForSquashing(const Columns & source_columns) override;
void shrinkToFit() override; void shrinkToFit() override;
void ensureOwnership() override; void ensureOwnership() override;

File diff suppressed because it is too large Load Diff

View File

@ -1,216 +1,117 @@
#pragma once #pragma once
#include <Columns/IColumn.h> #include <Columns/IColumn.h>
#include <Core/Field.h> #include <Columns/ColumnVector.h>
#include <Core/Names.h> #include <Columns/ColumnArray.h>
#include <DataTypes/Serializations/SubcolumnsTree.h> #include <Columns/ColumnTuple.h>
#include <Common/PODArray.h> #include <Columns/ColumnString.h>
#include <Common/WeakHash.h>
#include <DataTypes/IDataType.h> #include <DataTypes/IDataType.h>
#include <DataTypes/Serializations/SerializationDynamic.h>
#include <Formats/FormatSettings.h>
#include <Common/StringHashForHeterogeneousLookup.h>
#include <Common/WeakHash.h>
namespace DB namespace DB
{ {
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
}
/// Info that represents a scalar or array field in a decomposed view.
/// It allows to recreate field with different number
/// of dimensions or nullability.
struct FieldInfo
{
/// The common type of of all scalars in field.
DataTypePtr scalar_type;
/// Do we have NULL scalar in field.
bool have_nulls;
/// If true then we have scalars with different types in array and
/// we need to convert scalars to the common type.
bool need_convert;
/// Number of dimension in array. 0 if field is scalar.
size_t num_dimensions;
/// If true then this field is an array of variadic dimension field
/// and we need to normalize the dimension
bool need_fold_dimension;
};
FieldInfo getFieldInfo(const Field & field);
/** A column that represents object with dynamic set of subcolumns.
* Subcolumns are identified by paths in document and are stored in
* a trie-like structure. ColumnObject is not suitable for writing into tables
* and it should be converted to Tuple with fixed set of subcolumns before that.
*/
class ColumnObject final : public COWHelper<IColumnHelper<ColumnObject>, ColumnObject> class ColumnObject final : public COWHelper<IColumnHelper<ColumnObject>, ColumnObject>
{ {
public: public:
/** Class that represents one subcolumn. struct Statistics
* It stores values in several parts of column
* and keeps current common type of all parts.
* We add a new column part with a new type, when we insert a field,
* which can't be converted to the current common type.
* After insertion of all values subcolumn should be finalized
* for writing and other operations.
*/
class Subcolumn
{ {
public: enum class Source
Subcolumn() = default;
Subcolumn(size_t size_, bool is_nullable_);
Subcolumn(MutableColumnPtr && data_, bool is_nullable_);
size_t size() const;
size_t byteSize() const;
size_t allocatedBytes() const;
void get(size_t n, Field & res) const;
bool isFinalized() const;
const DataTypePtr & getLeastCommonType() const { return least_common_type.get(); }
const DataTypePtr & getLeastCommonTypeBase() const { return least_common_type.getBase(); }
size_t getNumberOfDimensions() const { return least_common_type.getNumberOfDimensions(); }
/// Checks the consistency of column's parts stored in @data.
void checkTypes() const;
/// Inserts a field, which scalars can be arbitrary, but number of
/// dimensions should be consistent with current common type.
void insert(Field field);
void insert(Field field, FieldInfo info);
void insertDefault();
void insertManyDefaults(size_t length);
void insertRangeFrom(const Subcolumn & src, size_t start, size_t length);
void popBack(size_t n);
Subcolumn cut(size_t start, size_t length) const;
/// Converts all column's parts to the common type and
/// creates a single column that stores all values.
void finalize();
/// Returns last inserted field.
Field getLastField() const;
FieldInfo getFieldInfo() const;
/// Recreates subcolumn with default scalar values and keeps sizes of arrays.
/// Used to create columns of type Nested with consistent array sizes.
Subcolumn recreateWithDefaultValues(const FieldInfo & field_info) const;
/// Returns single column if subcolumn in finalizes.
/// Otherwise -- undefined behaviour.
IColumn & getFinalizedColumn();
const IColumn & getFinalizedColumn() const;
const ColumnPtr & getFinalizedColumnPtr() const;
const std::vector<WrappedPtr> & getData() const { return data; }
size_t getNumberOfDefaultsInPrefix() const { return num_of_defaults_in_prefix; }
friend class ColumnObject;
private:
class LeastCommonType
{ {
public: READ, /// Statistics were loaded into column during reading from MergeTree.
LeastCommonType(); MERGE, /// Statistics were calculated during merge of several MergeTree parts.
explicit LeastCommonType(DataTypePtr type_);
const DataTypePtr & get() const { return type; }
const DataTypePtr & getBase() const { return base_type; }
size_t getNumberOfDimensions() const { return num_dimensions; }
private:
DataTypePtr type;
DataTypePtr base_type;
size_t num_dimensions = 0;
}; };
void addNewColumnPart(DataTypePtr type); explicit Statistics(Source source_) : source(source_) {}
/// Current least common type of all values inserted to this subcolumn. /// Source of the statistics.
LeastCommonType least_common_type; Source source;
/// Statistics for dynamic paths: (path) -> (total number of not-null values).
/// If true then common type type of subcolumn is Nullable std::unordered_map<String, size_t> dynamic_paths_statistics;
/// and default values are NULLs. /// Statistics for paths in shared data: (path) -> (total number of not-null values).
bool is_nullable = false; /// We don't store statistics for all paths in shared data but only for some subset of them
/// (is 10000 a good limit? It should not be expensive to store 10000 paths per part)
/// Parts of column. Parts should be in increasing order in terms of subtypes/supertypes. static const size_t MAX_SHARED_DATA_STATISTICS_SIZE = 10000;
/// That means that the least common type for i-th prefix is the type of i-th part std::unordered_map<String, size_t, StringHashForHeterogeneousLookup, StringHashForHeterogeneousLookup::transparent_key_equal> shared_data_paths_statistics;
/// and it's the supertype for all type of column from 0 to i-1.
std::vector<WrappedPtr> data;
/// Until we insert any non-default field we don't know further
/// least common type and we count number of defaults in prefix,
/// which will be converted to the default type of final common type.
size_t num_of_defaults_in_prefix = 0;
size_t num_rows = 0;
}; };
using Subcolumns = SubcolumnsTree<Subcolumn>; using StatisticsPtr = std::shared_ptr<const Statistics>;
private: private:
/// If true then all subcolumns are nullable. friend class COWHelper<IColumnHelper<ColumnObject>, ColumnObject>;
const bool is_nullable;
Subcolumns subcolumns; ColumnObject(std::unordered_map<String, MutableColumnPtr> typed_paths_, size_t max_dynamic_paths_, size_t max_dynamic_types_);
size_t num_rows; ColumnObject(
std::unordered_map<String, MutableColumnPtr> typed_paths_,
std::unordered_map<String, MutableColumnPtr> dynamic_paths_,
MutableColumnPtr shared_data_,
size_t max_dynamic_paths_,
size_t global_max_dynamic_paths_,
size_t max_dynamic_types_,
const StatisticsPtr & statistics_ = {});
/// Use StringHashForHeterogeneousLookup hash for hash maps to be able to use std::string_view in find() method.
using PathToColumnMap = std::unordered_map<String, WrappedPtr, StringHashForHeterogeneousLookup, StringHashForHeterogeneousLookup::transparent_key_equal>;
using PathToDynamicColumnPtrMap = std::unordered_map<String, ColumnDynamic *, StringHashForHeterogeneousLookup, StringHashForHeterogeneousLookup::transparent_key_equal>;
public: public:
static constexpr auto COLUMN_NAME_DUMMY = "_dummy"; /** Create immutable column using immutable arguments. These arguments may be shared with other columns.
* Use mutate in order to make mutable column and mutate shared nested columns.
*/
using Base = COWHelper<IColumnHelper<ColumnObject>, ColumnObject>;
explicit ColumnObject(bool is_nullable_); static Ptr create(
ColumnObject(Subcolumns && subcolumns_, bool is_nullable_); const std::unordered_map<String, ColumnPtr> & typed_paths_,
const std::unordered_map<String, ColumnPtr> & dynamic_paths_,
const ColumnPtr & shared_data_,
size_t max_dynamic_paths_,
size_t global_max_dynamic_paths_,
size_t max_dynamic_types_,
const StatisticsPtr & statistics_ = {});
/// Checks that all subcolumns have consistent sizes. static MutablePtr create(
void checkConsistency() const; std::unordered_map<String, MutableColumnPtr> typed_paths_,
std::unordered_map<String, MutableColumnPtr> dynamic_paths_,
MutableColumnPtr shared_data_,
size_t max_dynamic_paths_,
size_t global_max_dynamic_paths_,
size_t max_dynamic_types_,
const StatisticsPtr & statistics_ = {});
bool hasSubcolumn(const PathInData & key) const; static MutablePtr create(std::unordered_map<String, MutableColumnPtr> typed_paths_, size_t max_dynamic_paths_, size_t max_dynamic_types_);
const Subcolumn & getSubcolumn(const PathInData & key) const; std::string getName() const override;
Subcolumn & getSubcolumn(const PathInData & key);
void incrementNumRows() { ++num_rows; } const char * getFamilyName() const override
{
return "Object";
}
/// Adds a subcolumn from existing IColumn. TypeIndex getDataType() const override
void addSubcolumn(const PathInData & key, MutableColumnPtr && subcolumn); {
return TypeIndex::Object;
}
/// Adds a subcolumn of specific size with default values. MutableColumnPtr cloneEmpty() const override;
void addSubcolumn(const PathInData & key, size_t new_size); MutableColumnPtr cloneResized(size_t size) const override;
/// Adds a subcolumn of type Nested of specific size with default values. size_t size() const override
/// It cares about consistency of sizes of Nested arrays. {
void addNestedSubcolumn(const PathInData & key, const FieldInfo & field_info, size_t new_size); return shared_data->size();
}
/// Finds a subcolumn from the same Nested type as @entry and inserts Field operator[](size_t n) const override;
/// an array with default values with consistent sizes as in Nested type. void get(size_t n, Field & res) const override;
bool tryInsertDefaultFromNested(const Subcolumns::NodePtr & entry) const;
bool tryInsertManyDefaultsFromNested(const Subcolumns::NodePtr & entry) const;
const Subcolumns & getSubcolumns() const { return subcolumns; } bool isDefaultAt(size_t n) const override;
Subcolumns & getSubcolumns() { return subcolumns; } StringRef getDataAt(size_t n) const override;
PathsInData getKeys() const; void insertData(const char * pos, size_t length) override;
/// Part of interface
const char * getFamilyName() const override { return "Object"; }
TypeIndex getDataType() const override { return TypeIndex::Object; }
size_t size() const override;
size_t byteSize() const override;
size_t allocatedBytes() const override;
void forEachSubcolumn(MutableColumnCallback callback) override;
void forEachSubcolumnRecursively(RecursiveMutableColumnCallback callback) override;
void insert(const Field & field) override;
bool tryInsert(const Field & field) override;
void insertDefault() override;
void insert(const Field & x) override;
bool tryInsert(const Field & x) override;
#if !defined(DEBUG_OR_SANITIZER_BUILD) #if !defined(DEBUG_OR_SANITIZER_BUILD)
void insertFrom(const IColumn & src, size_t n) override; void insertFrom(const IColumn & src, size_t n) override;
void insertRangeFrom(const IColumn & src, size_t start, size_t length) override; void insertRangeFrom(const IColumn & src, size_t start, size_t length) override;
@ -218,24 +119,31 @@ public:
void doInsertFrom(const IColumn & src, size_t n) override; void doInsertFrom(const IColumn & src, size_t n) override;
void doInsertRangeFrom(const IColumn & src, size_t start, size_t length) override; void doInsertRangeFrom(const IColumn & src, size_t start, size_t length) override;
#endif #endif
/// TODO: implement more optimal insertManyFrom
void insertDefault() override;
void insertManyDefaults(size_t length) override;
void popBack(size_t length) override; void popBack(size_t n) override;
Field operator[](size_t n) const override;
void get(size_t n, Field & res) const override;
StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const override;
const char * deserializeAndInsertFromArena(const char * pos) override;
const char * skipSerializedInArena(const char * pos) const override;
void updateHashWithValue(size_t n, SipHash & hash) const override;
WeakHash32 getWeakHash32() const override;
void updateHashFast(SipHash & hash) const override;
ColumnPtr filter(const Filter & filt, ssize_t result_size_hint) const override;
void expand(const Filter & mask, bool inverted) override;
ColumnPtr permute(const Permutation & perm, size_t limit) const override; ColumnPtr permute(const Permutation & perm, size_t limit) const override;
ColumnPtr filter(const Filter & filter, ssize_t result_size_hint) const override;
ColumnPtr index(const IColumn & indexes, size_t limit) const override; ColumnPtr index(const IColumn & indexes, size_t limit) const override;
ColumnPtr replicate(const Offsets & offsets) const override; ColumnPtr replicate(const Offsets & replicate_offsets) const override;
MutableColumnPtr cloneResized(size_t new_size) const override; MutableColumns scatter(ColumnIndex num_columns, const Selector & selector) const override;
/// Finalizes all subcolumns. void getPermutation(PermutationSortDirection, PermutationSortStability, size_t, int, Permutation &) const override;
void finalize() override;
bool isFinalized() const override;
/// Order of rows in ColumnObject is undefined.
void getPermutation(PermutationSortDirection, PermutationSortStability, size_t, int, Permutation & res) const override;
void updatePermutation(PermutationSortDirection, PermutationSortStability, size_t, int, Permutation &, EqualRanges &) const override {} void updatePermutation(PermutationSortDirection, PermutationSortStability, size_t, int, Permutation &, EqualRanges &) const override {}
/// Values of ColumnObject are not comparable.
#if !defined(DEBUG_OR_SANITIZER_BUILD) #if !defined(DEBUG_OR_SANITIZER_BUILD)
int compareAt(size_t, size_t, const IColumn &, int) const override { return 0; } int compareAt(size_t, size_t, const IColumn &, int) const override { return 0; }
#else #else
@ -243,35 +151,118 @@ public:
#endif #endif
void getExtremes(Field & min, Field & max) const override; void getExtremes(Field & min, Field & max) const override;
/// All other methods throw exception. void reserve(size_t n) override;
size_t capacity() const override;
void prepareForSquashing(const std::vector<ColumnPtr> & source_columns) override;
void ensureOwnership() override;
size_t byteSize() const override;
size_t byteSizeAt(size_t n) const override;
size_t allocatedBytes() const override;
void protect() override;
StringRef getDataAt(size_t) const override { throwMustBeConcrete(); } void forEachSubcolumn(MutableColumnCallback callback) override;
bool isDefaultAt(size_t) const override { throwMustBeConcrete(); }
void insertData(const char *, size_t) override { throwMustBeConcrete(); }
StringRef serializeValueIntoArena(size_t, Arena &, char const *&) const override { throwMustBeConcrete(); }
char * serializeValueIntoMemory(size_t, char *) const override { throwMustBeConcrete(); }
const char * deserializeAndInsertFromArena(const char *) override { throwMustBeConcrete(); }
const char * skipSerializedInArena(const char *) const override { throwMustBeConcrete(); }
void updateHashWithValue(size_t, SipHash &) const override { throwMustBeConcrete(); }
WeakHash32 getWeakHash32() const override { throwMustBeConcrete(); }
void updateHashFast(SipHash & hash) const override;
void expand(const Filter &, bool) override { throwMustBeConcrete(); }
bool hasEqualValues() const override { throwMustBeConcrete(); }
size_t byteSizeAt(size_t) const override { throwMustBeConcrete(); }
double getRatioOfDefaultRows(double) const override { throwMustBeConcrete(); }
UInt64 getNumberOfDefaultRows() const override { throwMustBeConcrete(); }
void getIndicesOfNonDefaultRows(Offsets &, size_t, size_t) const override { throwMustBeConcrete(); }
private: void forEachSubcolumnRecursively(RecursiveMutableColumnCallback callback) override;
[[noreturn]] static void throwMustBeConcrete()
bool structureEquals(const IColumn & rhs) const override;
ColumnPtr compress() const override;
void finalize() override;
bool isFinalized() const override;
bool hasDynamicStructure() const override { return true; }
void takeDynamicStructureFromSourceColumns(const Columns & source_columns) override;
const PathToColumnMap & getTypedPaths() const { return typed_paths; }
PathToColumnMap & getTypedPaths() { return typed_paths; }
const PathToColumnMap & getDynamicPaths() const { return dynamic_paths; }
PathToColumnMap & getDynamicPaths() { return dynamic_paths; }
const PathToDynamicColumnPtrMap & getDynamicPathsPtrs() const { return dynamic_paths_ptrs; }
PathToDynamicColumnPtrMap & getDynamicPathsPtrs() { return dynamic_paths_ptrs; }
const StatisticsPtr & getStatistics() const { return statistics; }
const ColumnPtr & getSharedDataPtr() const { return shared_data; }
ColumnPtr & getSharedDataPtr() { return shared_data; }
IColumn & getSharedDataColumn() { return *shared_data; }
const ColumnArray & getSharedDataNestedColumn() const { return assert_cast<const ColumnArray &>(*shared_data); }
ColumnArray & getSharedDataNestedColumn() { return assert_cast<ColumnArray &>(*shared_data); }
ColumnArray::Offsets & getSharedDataOffsets() { return assert_cast<ColumnArray &>(*shared_data).getOffsets(); }
const ColumnArray::Offsets & getSharedDataOffsets() const { return assert_cast<const ColumnArray &>(*shared_data).getOffsets(); }
std::pair<ColumnString *, ColumnString *> getSharedDataPathsAndValues()
{ {
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "ColumnObject must be converted to ColumnTuple before use"); auto & column_array = assert_cast<ColumnArray &>(*shared_data);
auto & column_tuple = assert_cast<ColumnTuple &>(column_array.getData());
return {assert_cast<ColumnString *>(&column_tuple.getColumn(0)), assert_cast<ColumnString *>(&column_tuple.getColumn(1))};
} }
template <typename Func> std::pair<const ColumnString *, const ColumnString *> getSharedDataPathsAndValues() const
MutableColumnPtr applyForSubcolumns(Func && func) const; {
const auto & column_array = assert_cast<const ColumnArray &>(*shared_data);
const auto & column_tuple = assert_cast<const ColumnTuple &>(column_array.getData());
return {assert_cast<const ColumnString *>(&column_tuple.getColumn(0)), assert_cast<const ColumnString *>(&column_tuple.getColumn(1))};
}
/// It's used to get shared sized of Nested to insert correct default values. size_t getMaxDynamicTypes() const { return max_dynamic_types; }
const Subcolumns::Node * getLeafOfTheSameNested(const Subcolumns::NodePtr & entry) const; size_t getMaxDynamicPaths() const { return max_dynamic_paths; }
size_t getGlobalMaxDynamicPaths() const { return global_max_dynamic_paths; }
/// Try to add new dynamic path. Returns pointer to the new dynamic
/// path column or nullptr if limit on dynamic paths is reached.
ColumnDynamic * tryToAddNewDynamicPath(std::string_view path);
/// Throws an exception if cannot add.
void addNewDynamicPath(std::string_view path);
void setDynamicPaths(const std::vector<String> & paths);
void setMaxDynamicPaths(size_t max_dynamic_paths_);
void setStatistics(const StatisticsPtr & statistics_) { statistics = statistics_; }
void serializePathAndValueIntoSharedData(ColumnString * shared_data_paths, ColumnString * shared_data_values, std::string_view path, const IColumn & column, size_t n);
void deserializeValueFromSharedData(const ColumnString * shared_data_values, size_t n, IColumn & column) const;
/// Paths in shared data are sorted in each row. Use this method to find the lower bound for specific path in the row.
static size_t findPathLowerBoundInSharedData(StringRef path, const ColumnString & shared_data_paths, size_t start, size_t end);
/// Insert all the data from shared data with specified path to dynamic column.
static void fillPathColumnFromSharedData(IColumn & path_column, StringRef path, const ColumnPtr & shared_data_column, size_t start, size_t end);
private:
void insertFromSharedDataAndFillRemainingDynamicPaths(const ColumnObject & src_object_column, std::vector<std::string_view> && src_dynamic_paths_for_shared_data, size_t start, size_t length);
void serializePathAndValueIntoArena(Arena & arena, const char *& begin, StringRef path, StringRef value, StringRef & res) const;
/// Map path -> column for paths with explicitly specified types.
/// This set of paths is constant and cannot be changed.
PathToColumnMap typed_paths;
/// Map path -> column for dynamically added paths. All columns
/// here are Dynamic columns. This set of paths can be extended
/// during inserts into the column.
PathToColumnMap dynamic_paths;
/// Store and use pointers to ColumnDynamic to avoid virtual calls.
/// With hundreds of dynamic paths these virtual calls are noticeable.
PathToDynamicColumnPtrMap dynamic_paths_ptrs;
/// Shared storage for all other paths and values. It's filled
/// when the number of dynamic paths reaches the limit.
/// It has type Array(Tuple(String, String)) and stores
/// an array of pairs (path, binary serialized dynamic value) for each row.
WrappedPtr shared_data;
/// Maximum number of dynamic paths. If this limit is reached, all new paths will be inserted into shared data.
/// This limit can be different for different instances of Object column. For example, we can decrease it
/// in takeDynamicStructureFromSourceColumns before merge.
size_t max_dynamic_paths;
/// Global limit on number of dynamic paths for all column instances of this Object type. It's the limit specified
/// in the type definition (for example 'JSON(max_dynamic_paths=N)'). max_dynamic_paths is always not greater than this limit.
size_t global_max_dynamic_paths;
/// Maximum number of dynamic types for each dynamic path. Used while creating Dynamic columns for new dynamic paths.
size_t max_dynamic_types;
/// Statistics on the number of non-null values for each dynamic path and for some shared data paths in the MergeTree data part.
/// Calculated during serializing of data part in MergeTree. Used to determine the set of dynamic paths for the merged part.
StatisticsPtr statistics;
}; };
} }

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,275 @@
#pragma once
#include <Columns/IColumn.h>
#include <Core/Field.h>
#include <Core/Names.h>
#include <DataTypes/Serializations/SubcolumnsTree.h>
#include <Common/PODArray.h>
#include <Common/WeakHash.h>
#include <DataTypes/IDataType.h>
namespace DB
{
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
}
/// Info that represents a scalar or array field in a decomposed view.
/// It allows to recreate field with different number
/// of dimensions or nullability.
///
/// The struct stays an aggregate, so brace-initialization with all five members
/// keeps working. The default member initializers only guarantee that a
/// default-initialized FieldInfo never carries indeterminate flags or dimensions.
struct FieldInfo
{
    /// The common type of all scalars in field.
    DataTypePtr scalar_type;

    /// Do we have NULL scalar in field.
    bool have_nulls = false;

    /// If true then we have scalars with different types in array and
    /// we need to convert scalars to the common type.
    bool need_convert = false;

    /// Number of dimension in array. 0 if field is scalar.
    size_t num_dimensions = 0;

    /// If true then this field is an array of variadic dimension field
    /// and we need to normalize the dimension
    bool need_fold_dimension = false;
};
/// Returns the FieldInfo describing @field.
/// Only the declaration lives in this header.
FieldInfo getFieldInfo(const Field & field);
/** A column that represents object with dynamic set of subcolumns.
* Subcolumns are identified by paths in document and are stored in
* a trie-like structure. ColumnObjectDeprecated is not suitable for writing into tables
* and it should be converted to Tuple with fixed set of subcolumns before that.
*/
class ColumnObjectDeprecated final : public COWHelper<IColumnHelper<ColumnObjectDeprecated>, ColumnObjectDeprecated>
{
public:
/** Class that represents one subcolumn.
* It stores values in several parts of column
* and keeps current common type of all parts.
* We add a new column part with a new type, when we insert a field,
* which can't be converted to the current common type.
* After insertion of all values subcolumn should be finalized
* for writing and other operations.
*/
class Subcolumn
{
public:
Subcolumn() = default;
Subcolumn(size_t size_, bool is_nullable_);
Subcolumn(MutableColumnPtr && data_, bool is_nullable_);
size_t size() const;
size_t byteSize() const;
size_t allocatedBytes() const;
void get(size_t n, Field & res) const;
bool isFinalized() const;
/// Accessors for the current least common type, its base type and its number of dimensions.
const DataTypePtr & getLeastCommonType() const { return least_common_type.get(); }
const DataTypePtr & getLeastCommonTypeBase() const { return least_common_type.getBase(); }
size_t getNumberOfDimensions() const { return least_common_type.getNumberOfDimensions(); }
/// Checks the consistency of column's parts stored in @data.
void checkTypes() const;
/// Inserts a field, which scalars can be arbitrary, but number of
/// dimensions should be consistent with current common type.
void insert(Field field);
void insert(Field field, FieldInfo info);
void insertDefault();
void insertManyDefaults(size_t length);
void insertRangeFrom(const Subcolumn & src, size_t start, size_t length);
void popBack(size_t n);
Subcolumn cut(size_t start, size_t length) const;
/// Converts all column's parts to the common type and
/// creates a single column that stores all values.
void finalize();
/// Returns last inserted field.
Field getLastField() const;
FieldInfo getFieldInfo() const;
/// Recreates subcolumn with default scalar values and keeps sizes of arrays.
/// Used to create columns of type Nested with consistent array sizes.
Subcolumn recreateWithDefaultValues(const FieldInfo & field_info) const;
/// Returns single column if subcolumn is finalized.
/// Otherwise -- undefined behaviour.
IColumn & getFinalizedColumn();
const IColumn & getFinalizedColumn() const;
const ColumnPtr & getFinalizedColumnPtr() const;
/// Read-only access to the column parts and to the counter of leading defaults.
const std::vector<WrappedPtr> & getData() const { return data; }
size_t getNumberOfDefaultsInPrefix() const { return num_of_defaults_in_prefix; }
friend class ColumnObjectDeprecated;
private:
/// Holds a type together with its base type and number of dimensions.
class LeastCommonType
{
public:
LeastCommonType();
explicit LeastCommonType(DataTypePtr type_);
const DataTypePtr & get() const { return type; }
const DataTypePtr & getBase() const { return base_type; }
size_t getNumberOfDimensions() const { return num_dimensions; }
private:
DataTypePtr type;
DataTypePtr base_type;
size_t num_dimensions = 0;
};
void addNewColumnPart(DataTypePtr type);
/// Current least common type of all values inserted to this subcolumn.
LeastCommonType least_common_type;
/// If true then common type of subcolumn is Nullable
/// and default values are NULLs.
bool is_nullable = false;
/// Parts of column. Parts should be in increasing order in terms of subtypes/supertypes.
/// That means that the least common type for i-th prefix is the type of i-th part
/// and it's the supertype for all types of parts from 0 to i-1.
std::vector<WrappedPtr> data;
/// Until we insert any non-default field we don't know further
/// least common type and we count number of defaults in prefix,
/// which will be converted to the default type of final common type.
size_t num_of_defaults_in_prefix = 0;
size_t num_rows = 0;
};
using Subcolumns = SubcolumnsTree<Subcolumn>;
private:
/// If true then all subcolumns are nullable.
const bool is_nullable;
Subcolumns subcolumns;
size_t num_rows;
public:
static constexpr auto COLUMN_NAME_DUMMY = "_dummy";
explicit ColumnObjectDeprecated(bool is_nullable_);
ColumnObjectDeprecated(Subcolumns && subcolumns_, bool is_nullable_);
/// Checks that all subcolumns have consistent sizes.
void checkConsistency() const;
bool hasSubcolumn(const PathInData & key) const;
const Subcolumn & getSubcolumn(const PathInData & key) const;
Subcolumn & getSubcolumn(const PathInData & key);
/// Increments only the row counter of the object column; subcolumns are not touched here.
void incrementNumRows() { ++num_rows; }
/// Adds a subcolumn from existing IColumn.
void addSubcolumn(const PathInData & key, MutableColumnPtr && subcolumn);
/// Adds a subcolumn of specific size with default values.
void addSubcolumn(const PathInData & key, size_t new_size);
/// Adds a subcolumn of type Nested of specific size with default values.
/// It cares about consistency of sizes of Nested arrays.
void addNestedSubcolumn(const PathInData & key, const FieldInfo & field_info, size_t new_size);
/// Finds a subcolumn from the same Nested type as @entry and inserts
/// an array with default values with consistent sizes as in Nested type.
bool tryInsertDefaultFromNested(const Subcolumns::NodePtr & entry) const;
bool tryInsertManyDefaultsFromNested(const Subcolumns::NodePtr & entry) const;
const Subcolumns & getSubcolumns() const { return subcolumns; }
Subcolumns & getSubcolumns() { return subcolumns; }
PathsInData getKeys() const;
/// Part of interface
const char * getFamilyName() const override { return "Object"; }
TypeIndex getDataType() const override { return TypeIndex::ObjectDeprecated; }
size_t size() const override;
size_t byteSize() const override;
size_t allocatedBytes() const override;
void forEachSubcolumn(MutableColumnCallback callback) override;
void forEachSubcolumnRecursively(RecursiveMutableColumnCallback callback) override;
void insert(const Field & field) override;
bool tryInsert(const Field & field) override;
void insertDefault() override;
/// Depending on DEBUG_OR_SANITIZER_BUILD either insertFrom/insertRangeFrom
/// or their do* counterparts are the functions being overridden.
#if !defined(DEBUG_OR_SANITIZER_BUILD)
void insertFrom(const IColumn & src, size_t n) override;
void insertRangeFrom(const IColumn & src, size_t start, size_t length) override;
#else
void doInsertFrom(const IColumn & src, size_t n) override;
void doInsertRangeFrom(const IColumn & src, size_t start, size_t length) override;
#endif
void popBack(size_t length) override;
Field operator[](size_t n) const override;
void get(size_t n, Field & res) const override;
ColumnPtr permute(const Permutation & perm, size_t limit) const override;
ColumnPtr filter(const Filter & filter, ssize_t result_size_hint) const override;
ColumnPtr index(const IColumn & indexes, size_t limit) const override;
ColumnPtr replicate(const Offsets & offsets) const override;
MutableColumnPtr cloneResized(size_t new_size) const override;
/// Finalizes all subcolumns.
void finalize() override;
bool isFinalized() const override;
/// Order of rows in ColumnObjectDeprecated is undefined.
/// Accordingly, updatePermutation is a no-op and row comparison always returns 0.
void getPermutation(PermutationSortDirection, PermutationSortStability, size_t, int, Permutation & res) const override;
void updatePermutation(PermutationSortDirection, PermutationSortStability, size_t, int, Permutation &, EqualRanges &) const override {}
#if !defined(DEBUG_OR_SANITIZER_BUILD)
int compareAt(size_t, size_t, const IColumn &, int) const override { return 0; }
#else
int doCompareAt(size_t, size_t, const IColumn &, int) const override { return 0; }
#endif
void getExtremes(Field & min, Field & max) const override;
/// All other methods throw exception.
/// (Except updateHashFast below, which is declared here and defined elsewhere.)
StringRef getDataAt(size_t) const override { throwMustBeConcrete(); }
bool isDefaultAt(size_t) const override { throwMustBeConcrete(); }
void insertData(const char *, size_t) override { throwMustBeConcrete(); }
StringRef serializeValueIntoArena(size_t, Arena &, char const *&) const override { throwMustBeConcrete(); }
char * serializeValueIntoMemory(size_t, char *) const override { throwMustBeConcrete(); }
const char * deserializeAndInsertFromArena(const char *) override { throwMustBeConcrete(); }
const char * skipSerializedInArena(const char *) const override { throwMustBeConcrete(); }
void updateHashWithValue(size_t, SipHash &) const override { throwMustBeConcrete(); }
WeakHash32 getWeakHash32() const override { throwMustBeConcrete(); }
void updateHashFast(SipHash &) const override;
void expand(const Filter &, bool) override { throwMustBeConcrete(); }
bool hasEqualValues() const override { throwMustBeConcrete(); }
size_t byteSizeAt(size_t) const override { throwMustBeConcrete(); }
double getRatioOfDefaultRows(double) const override { throwMustBeConcrete(); }
UInt64 getNumberOfDefaultRows() const override { throwMustBeConcrete(); }
void getIndicesOfNonDefaultRows(Offsets &, size_t, size_t) const override { throwMustBeConcrete(); }
private:
/// Throws NOT_IMPLEMENTED; shared by all methods that are unsupported for this column.
[[noreturn]] static void throwMustBeConcrete()
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "ColumnObjectDeprecated must be converted to ColumnTuple before use");
}
template <typename Func>
MutableColumnPtr applyForSubcolumns(Func && func) const;
/// It's used to get shared sizes of Nested to insert correct default values.
const Subcolumns::Node * getLeafOfTheSameNested(const Subcolumns::NodePtr & entry) const;
};
}

View File

@ -557,6 +557,11 @@ void ColumnString::reserve(size_t n)
offsets.reserve_exact(n); offsets.reserve_exact(n);
} }
/// Reports the capacity of the offsets array as the capacity of the column.
size_t ColumnString::capacity() const
{
    const auto offsets_capacity = offsets.capacity();
    return offsets_capacity;
}
void ColumnString::prepareForSquashing(const Columns & source_columns) void ColumnString::prepareForSquashing(const Columns & source_columns)
{ {
size_t new_size = size(); size_t new_size = size();

View File

@ -283,6 +283,7 @@ public:
ColumnPtr compress() const override; ColumnPtr compress() const override;
void reserve(size_t n) override; void reserve(size_t n) override;
/// Overrides the base-class capacity(); the definition is out of line.
size_t capacity() const override;
void prepareForSquashing(const Columns & source_columns) override; void prepareForSquashing(const Columns & source_columns) override;
void shrinkToFit() override; void shrinkToFit() override;

View File

@ -595,6 +595,14 @@ void ColumnTuple::reserve(size_t n)
getColumn(i).reserve(n); getColumn(i).reserve(n);
} }
/// Capacity of the tuple column: the capacity of the first element column,
/// or the current size() when the tuple has no element columns.
size_t ColumnTuple::capacity() const
{
    return columns.empty() ? size() : getColumn(0).capacity();
}
void ColumnTuple::prepareForSquashing(const Columns & source_columns) void ColumnTuple::prepareForSquashing(const Columns & source_columns)
{ {
const size_t tuple_size = columns.size(); const size_t tuple_size = columns.size();

View File

@ -110,6 +110,7 @@ public:
void updatePermutationWithCollation(const Collator & collator, IColumn::PermutationSortDirection direction, IColumn::PermutationSortStability stability, void updatePermutationWithCollation(const Collator & collator, IColumn::PermutationSortDirection direction, IColumn::PermutationSortStability stability,
size_t limit, int nan_direction_hint, IColumn::Permutation & res, EqualRanges& equal_ranges) const override; size_t limit, int nan_direction_hint, IColumn::Permutation & res, EqualRanges& equal_ranges) const override;
void reserve(size_t n) override; void reserve(size_t n) override;
/// Overrides the base-class capacity(); the definition is out of line.
size_t capacity() const override;
void prepareForSquashing(const Columns & source_columns) override; void prepareForSquashing(const Columns & source_columns) override;
void shrinkToFit() override; void shrinkToFit() override;
void ensureOwnership() override; void ensureOwnership() override;

View File

@ -1277,6 +1277,11 @@ void ColumnVariant::prepareForSquashing(const Columns & source_columns)
} }
} }
size_t ColumnVariant::capacity() const
{
return local_discriminators->capacity();
}
void ColumnVariant::ensureOwnership() void ColumnVariant::ensureOwnership()
{ {
const size_t num_variants = variants.size(); const size_t num_variants = variants.size();

View File

@ -241,6 +241,7 @@ public:
size_t limit, int nan_direction_hint, IColumn::Permutation & res, EqualRanges & equal_ranges) const override; size_t limit, int nan_direction_hint, IColumn::Permutation & res, EqualRanges & equal_ranges) const override;
void reserve(size_t n) override; void reserve(size_t n) override;
/// Overrides the base-class capacity(); the definition is out of line.
size_t capacity() const override;
void prepareForSquashing(const Columns & source_columns) override; void prepareForSquashing(const Columns & source_columns) override;
void ensureOwnership() override; void ensureOwnership() override;
size_t byteSize() const override; size_t byteSize() const override;

View File

@ -180,6 +180,11 @@ public:
data.reserve_exact(n); data.reserve_exact(n);
} }
/// Reports the capacity of the underlying data container.
size_t capacity() const override
{
    const auto allocated_elements = data.capacity();
    return allocated_elements;
}
void shrinkToFit() override void shrinkToFit() override
{ {
data.shrink_to_fit(); data.shrink_to_fit();

View File

@ -11,12 +11,13 @@
#include <Columns/ColumnLowCardinality.h> #include <Columns/ColumnLowCardinality.h>
#include <Columns/ColumnMap.h> #include <Columns/ColumnMap.h>
#include <Columns/ColumnNullable.h> #include <Columns/ColumnNullable.h>
#include <Columns/ColumnObject.h> #include <Columns/ColumnObjectDeprecated.h>
#include <Columns/ColumnSparse.h> #include <Columns/ColumnSparse.h>
#include <Columns/ColumnString.h> #include <Columns/ColumnString.h>
#include <Columns/ColumnTuple.h> #include <Columns/ColumnTuple.h>
#include <Columns/ColumnVariant.h> #include <Columns/ColumnVariant.h>
#include <Columns/ColumnDynamic.h> #include <Columns/ColumnDynamic.h>
#include <Columns/ColumnObject.h>
#include <Columns/ColumnVector.h> #include <Columns/ColumnVector.h>
#include <Core/Field.h> #include <Core/Field.h>
#include <DataTypes/Serializations/SerializationInfo.h> #include <DataTypes/Serializations/SerializationInfo.h>
@ -466,12 +467,13 @@ template class IColumnHelper<ColumnArray, IColumn>;
template class IColumnHelper<ColumnTuple, IColumn>; template class IColumnHelper<ColumnTuple, IColumn>;
template class IColumnHelper<ColumnMap, IColumn>; template class IColumnHelper<ColumnMap, IColumn>;
template class IColumnHelper<ColumnSparse, IColumn>; template class IColumnHelper<ColumnSparse, IColumn>;
template class IColumnHelper<ColumnObject, IColumn>; template class IColumnHelper<ColumnObjectDeprecated, IColumn>;
template class IColumnHelper<ColumnAggregateFunction, IColumn>; template class IColumnHelper<ColumnAggregateFunction, IColumn>;
template class IColumnHelper<ColumnFunction, IColumn>; template class IColumnHelper<ColumnFunction, IColumn>;
template class IColumnHelper<ColumnCompressed, IColumn>; template class IColumnHelper<ColumnCompressed, IColumn>;
template class IColumnHelper<ColumnVariant, IColumn>; template class IColumnHelper<ColumnVariant, IColumn>;
template class IColumnHelper<ColumnDynamic, IColumn>; template class IColumnHelper<ColumnDynamic, IColumn>;
template class IColumnHelper<ColumnObject, IColumn>;
template class IColumnHelper<IColumnDummy, IColumn>; template class IColumnHelper<IColumnDummy, IColumn>;

View File

@ -475,6 +475,9 @@ public:
/// It affects performance only (not correctness). /// It affects performance only (not correctness).
virtual void reserve(size_t /*n*/) {} virtual void reserve(size_t /*n*/) {}
/// Returns the number of elements allocated in reserve.
/// This default implementation simply reports the current size().
virtual size_t capacity() const
{
    return size();
}
/// Reserve memory before squashing all specified source columns into this column. /// Reserve memory before squashing all specified source columns into this column.
virtual void prepareForSquashing(const std::vector<Ptr> & source_columns) virtual void prepareForSquashing(const std::vector<Ptr> & source_columns)
{ {

View File

@ -0,0 +1,351 @@
#include <Columns/ColumnString.h>
#include <Columns/ColumnObject.h>
#include <DataTypes/DataTypeFactory.h>
#include <IO/ReadBufferFromMemory.h>
#include <IO/WriteBufferFromString.h>
#include <Common/Arena.h>
#include <gtest/gtest.h>
using namespace DB;
/// A freshly created JSON column exposes the typed paths and limits from the
/// type definition and contains no dynamic paths and no shared data.
TEST(ColumnObject, CreateEmpty)
{
    const auto json_type = DataTypeFactory::instance().get("JSON(max_dynamic_types=10, max_dynamic_paths=20, a.b UInt32, a.c Array(String))");
    const auto column = json_type->createColumn();
    const auto & object_column = assert_cast<const ColumnObject &>(*column);

    /// Typed paths come from the type definition.
    const auto & typed = object_column.getTypedPaths();
    ASSERT_TRUE(typed.contains("a.b"));
    ASSERT_EQ(typed.at("a.b")->getName(), "UInt32");
    ASSERT_TRUE(typed.contains("a.c"));
    ASSERT_EQ(typed.at("a.c")->getName(), "Array(String)");

    /// Nothing was inserted, so dynamic paths and shared data are empty.
    ASSERT_TRUE(object_column.getDynamicPaths().empty());
    ASSERT_TRUE(object_column.getSharedDataOffsets().empty());
    const auto shared_paths_and_values = object_column.getSharedDataPathsAndValues();
    ASSERT_TRUE(shared_paths_and_values.first->empty());
    ASSERT_TRUE(shared_paths_and_values.second->empty());

    /// Limits are taken from the type parameters.
    ASSERT_EQ(object_column.getMaxDynamicTypes(), 10);
    ASSERT_EQ(object_column.getMaxDynamicPaths(), 20);
}
/// The column name lists the limits first and then the typed paths sorted by path,
/// regardless of the order used in the type definition.
TEST(ColumnObject, GetName)
{
    const auto json_type = DataTypeFactory::instance().get("JSON(max_dynamic_types=10, max_dynamic_paths=20, b.d UInt32, a.b Array(String))");
    const auto column = json_type->createColumn();
    const auto column_name = column->getName();
    ASSERT_EQ(column_name, "Object(max_dynamic_paths=20, max_dynamic_types=10, a.b Array(String), b.d UInt32)");
}
/// Test helper: takes the n-th string of @values and decodes it as a
/// binary-serialized Dynamic value with default format settings.
Field deserializeFieldFromSharedData(ColumnString * values, size_t n)
{
    const auto serialized = values->getDataAt(n);
    ReadBufferFromMemory buffer(serialized.data, serialized.size);

    Field result;
    const auto serialization = std::make_shared<SerializationDynamic>();
    serialization->deserializeBinary(result, buffer, FormatSettings());
    return result;
}
/// Inserts Object fields one by one into a column with max_dynamic_paths=2 and checks
/// after each insert how values are distributed between typed paths, dynamic paths and shared data.
TEST(ColumnObject, InsertField)
{
auto type = DataTypeFactory::instance().get("JSON(max_dynamic_types=10, max_dynamic_paths=2, b.d UInt32, a.b Array(String))");
auto col = type->createColumn();
auto & col_object = assert_cast<ColumnObject &>(*col);
const auto & typed_paths = col_object.getTypedPaths();
const auto & dynamic_paths = col_object.getDynamicPaths();
const auto & shared_data_nested_column = col_object.getSharedDataNestedColumn();
const auto & shared_data_offsets = col_object.getSharedDataOffsets();
const auto [shared_data_paths, shared_data_values] = col_object.getSharedDataPathsAndValues();
/// Row 0: empty object. Typed paths get defaults, nothing else is created.
Object empty_object;
col_object.insert(empty_object);
ASSERT_EQ(col_object[0], (Object{{"a.b", Array{}}, {"b.d", Field(0u)}}));
ASSERT_EQ(typed_paths.at("a.b")->size(), 1);
ASSERT_TRUE(typed_paths.at("a.b")->isDefaultAt(0));
ASSERT_EQ(typed_paths.at("b.d")->size(), 1);
ASSERT_TRUE(typed_paths.at("b.d")->isDefaultAt(0));
ASSERT_TRUE(dynamic_paths.empty());
ASSERT_EQ(shared_data_nested_column.size(), 1);
ASSERT_TRUE(shared_data_nested_column.isDefaultAt(0));
/// Row 1: one typed path and one new path "a.c", which becomes the first dynamic path.
Object object1 = {{"a.b", Array{String("Hello"), String("World")}}, {"a.c", Field(42)}};
col_object.insert(object1);
ASSERT_EQ(col_object[1], (Object{{"a.b", Array{String("Hello"), String("World")}}, {"b.d", Field(0u)}, {"a.c", Field(42)}}));
ASSERT_EQ(typed_paths.at("a.b")->size(), 2);
ASSERT_EQ((*typed_paths.at("a.b"))[1], (Array{String("Hello"), String("World")}));
ASSERT_EQ(typed_paths.at("b.d")->size(), 2);
ASSERT_TRUE(typed_paths.at("b.d")->isDefaultAt(1));
ASSERT_EQ(dynamic_paths.size(), 1);
ASSERT_TRUE(dynamic_paths.contains("a.c"));
ASSERT_EQ(dynamic_paths.at("a.c")->size(), 2);
ASSERT_TRUE(dynamic_paths.at("a.c")->isDefaultAt(0));
ASSERT_EQ((*dynamic_paths.at("a.c"))[1], Field(42));
ASSERT_EQ(shared_data_nested_column.size(), 2);
ASSERT_TRUE(shared_data_nested_column.isDefaultAt(1));
/// Row 2: "a.d" takes the second (last) dynamic path slot; "a.e" and "a.f" are expected in shared data.
Object object2 = {{"b.d", Field(142u)}, {"a.c", Field(43)}, {"a.d", Field("str")}, {"a.e", Field(242)}, {"a.f", Array{Field(42), Field(43)}}};
col_object.insert(object2);
ASSERT_EQ(col_object[2], (Object{{"a.b", Array{}}, {"b.d", Field(142u)}, {"a.c", Field(43)}, {"a.d", Field("str")}, {"a.e", Field(242)}, {"a.f", Array{Field(42), Field(43)}}}));
ASSERT_EQ(typed_paths.at("a.b")->size(), 3);
ASSERT_TRUE(typed_paths.at("a.b")->isDefaultAt(2));
ASSERT_EQ(typed_paths.at("b.d")->size(), 3);
ASSERT_EQ((*typed_paths.at("b.d"))[2], Field(142u));
ASSERT_EQ(dynamic_paths.size(), 2);
ASSERT_TRUE(dynamic_paths.contains("a.c"));
ASSERT_EQ(dynamic_paths.at("a.c")->size(), 3);
ASSERT_EQ((*dynamic_paths.at("a.c"))[2], Field(43));
ASSERT_TRUE(dynamic_paths.contains("a.d"));
ASSERT_EQ(dynamic_paths.at("a.d")->size(), 3);
ASSERT_EQ((*dynamic_paths.at("a.d"))[2], Field("str"));
ASSERT_EQ(shared_data_nested_column.size(), 3);
ASSERT_EQ(shared_data_offsets[2] - shared_data_offsets[1], 2);
ASSERT_EQ((*shared_data_paths)[0], "a.e");
ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 0), Field(242));
ASSERT_EQ((*shared_data_paths)[1], "a.f");
ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 1), (Array({Field(42), Field(43)})));
/// Row 3: only new paths while the dynamic path limit is already reached, so all three are expected in shared data.
Object object3 = {{"b.a", Field("Str")}, {"b.b", Field(2)}, {"b.c", Field(Tuple{Field(42), Field("Str")})}};
col_object.insert(object3);
ASSERT_EQ(col_object[3], (Object{{"a.b", Array{}}, {"b.d", Field(0u)}, {"b.a", Field("Str")}, {"b.b", Field(2)}, {"b.c", Field(Tuple{Field(42), Field("Str")})}}));
ASSERT_EQ(typed_paths.at("a.b")->size(), 4);
ASSERT_TRUE(typed_paths.at("a.b")->isDefaultAt(3));
ASSERT_EQ(typed_paths.at("b.d")->size(), 4);
ASSERT_TRUE(typed_paths.at("b.d")->isDefaultAt(3));
ASSERT_EQ(dynamic_paths.size(), 2);
ASSERT_EQ(dynamic_paths.at("a.c")->size(), 4);
ASSERT_TRUE(dynamic_paths.at("a.c")->isDefaultAt(3));
ASSERT_EQ(dynamic_paths.at("a.d")->size(), 4);
ASSERT_TRUE(dynamic_paths.at("a.d")->isDefaultAt(3));
ASSERT_EQ(shared_data_nested_column.size(), 4);
ASSERT_EQ(shared_data_offsets[3] - shared_data_offsets[2], 3);
ASSERT_EQ((*shared_data_paths)[2], "b.a");
ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 2), Field("Str"));
ASSERT_EQ((*shared_data_paths)[3], "b.b");
ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 3), Field(2));
ASSERT_EQ((*shared_data_paths)[4], "b.c");
ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 4), Field(Tuple{Field(42), Field("Str")}));
/// Row 4: only Null values for new paths; the shared data row is expected to stay default (empty).
Object object4 = {{"c.c", Field(Null())}, {"c.d", Field(Null())}};
col_object.insert(object4);
ASSERT_TRUE(shared_data_nested_column.isDefaultAt(4));
}
/// Copies single rows from three source columns into a destination column with max_dynamic_paths=2
/// and checks where each path of the copied row ends up in the destination.
TEST(ColumnObject, InsertFrom)
{
auto type = DataTypeFactory::instance().get("JSON(max_dynamic_types=10, max_dynamic_paths=2, b.d UInt32, a.b Array(String))");
auto col = type->createColumn();
auto & col_object = assert_cast<ColumnObject &>(*col);
/// Row 0 of the destination: makes "a.a" the first dynamic path.
col_object.insert(Object{{"a.a", Field(42)}});
const auto & typed_paths = col_object.getTypedPaths();
const auto & dynamic_paths = col_object.getDynamicPaths();
const auto & shared_data_nested_column = col_object.getSharedDataNestedColumn();
const auto & shared_data_offsets = col_object.getSharedDataOffsets();
const auto [shared_data_paths, shared_data_values] = col_object.getSharedDataPathsAndValues();
/// Source 1: its dynamic path "a.c" fits into the remaining dynamic path slot of the destination.
auto src_col1 = type->createColumn();
auto & src_col_object1 = assert_cast<ColumnObject &>(*src_col1);
src_col_object1.insert(Object{{"b.d", Field(43u)}, {"a.c", Field("Str1")}});
col_object.insertFrom(src_col_object1, 0);
ASSERT_EQ((*typed_paths.at("a.b"))[1], Field(Array{}));
ASSERT_EQ((*typed_paths.at("b.d"))[1], Field(43u));
ASSERT_EQ(dynamic_paths.size(), 2);
ASSERT_EQ((*dynamic_paths.at("a.a"))[1], Field(Null()));
ASSERT_EQ((*dynamic_paths.at("a.c"))[1], Field("Str1"));
ASSERT_TRUE(shared_data_nested_column.isDefaultAt(1));
/// Source 2: "a.d" and "a.e" are dynamic paths in the source, but the destination has no free slots,
/// so they are expected in the destination's shared data.
auto src_col2 = type->createColumn();
auto & src_col_object2 = assert_cast<ColumnObject &>(*src_col2);
src_col_object2.insert(Object{{"a.b", Array{"Str4", "Str5"}}, {"b.d", Field(44u)}, {"a.d", Field("Str2")}, {"a.e", Field("Str3")}});
col_object.insertFrom(src_col_object2, 0);
ASSERT_EQ((*typed_paths.at("a.b"))[2], Field(Array{"Str4", "Str5"}));
ASSERT_EQ((*typed_paths.at("b.d"))[2], Field(44u));
ASSERT_EQ(dynamic_paths.size(), 2);
ASSERT_EQ((*dynamic_paths.at("a.a"))[2], Field(Null()));
ASSERT_EQ((*dynamic_paths.at("a.c"))[2], Field(Null()));
ASSERT_EQ(shared_data_offsets[2] - shared_data_offsets[1], 2);
ASSERT_EQ((*shared_data_paths)[0], "a.d");
ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 0), Field("Str2"));
ASSERT_EQ((*shared_data_paths)[1], "a.e");
ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 1), Field("Str3"));
/// Source 3: row 1 is copied. "a.a" and "a.c" match the destination's dynamic paths; the other
/// non-Null paths are expected in shared data ordered by path, and the Null "a.u" is not stored
/// (5 shared entries are asserted).
auto src_col3 = type->createColumn();
auto & src_col_object3 = assert_cast<ColumnObject &>(*src_col3);
src_col_object3.insert(Object{{"a.h", Field("Str6")}, {"h.h", Field("Str7")}});
src_col_object3.insert(Object{{"a.a", Field("Str10")}, {"a.c", Field(45u)}, {"a.h", Field("Str6")}, {"h.h", Field("Str7")}, {"a.f", Field("Str8")}, {"a.g", Field("Str9")}, {"a.i", Field("Str11")}, {"a.u", Field(Null())}});
col_object.insertFrom(src_col_object3, 1);
ASSERT_EQ((*typed_paths.at("a.b"))[3], Field(Array{}));
ASSERT_EQ((*typed_paths.at("b.d"))[3], Field(0u));
ASSERT_EQ(dynamic_paths.size(), 2);
ASSERT_EQ((*dynamic_paths.at("a.a"))[3], Field("Str10"));
ASSERT_EQ((*dynamic_paths.at("a.c"))[3], Field(45u));
ASSERT_EQ(shared_data_offsets[3] - shared_data_offsets[2], 5);
ASSERT_EQ((*shared_data_paths)[2], "a.f");
ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 2), Field("Str8"));
ASSERT_EQ((*shared_data_paths)[3], "a.g");
ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 3), Field("Str9"));
ASSERT_EQ((*shared_data_paths)[4], "a.h");
ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 4), Field("Str6"));
ASSERT_EQ((*shared_data_paths)[5], "a.i");
ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 5), Field("Str11"));
ASSERT_EQ((*shared_data_paths)[6], "h.h");
ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 6), Field("Str7"));
}
/// Copies ranges of rows from three source columns into a destination column with max_dynamic_paths=2
/// and checks typed paths, dynamic paths and shared data of every copied row.
TEST(ColumnObject, InsertRangeFrom)
{
auto type = DataTypeFactory::instance().get("JSON(max_dynamic_types=10, max_dynamic_paths=2, b.d UInt32, a.b Array(String))");
auto col = type->createColumn();
auto & col_object = assert_cast<ColumnObject &>(*col);
/// Row 0 of the destination: makes "a.a" the first dynamic path.
col_object.insert(Object{{"a.a", Field(42)}});
const auto & typed_paths = col_object.getTypedPaths();
const auto & dynamic_paths = col_object.getDynamicPaths();
const auto & shared_data_nested_column = col_object.getSharedDataNestedColumn();
const auto & shared_data_offsets = col_object.getSharedDataOffsets();
const auto [shared_data_paths, shared_data_values] = col_object.getSharedDataPathsAndValues();
/// Source 1 (destination rows 1-3): uses only "a.a" and "a.c", which both fit into the
/// destination's dynamic paths, so shared data is expected to stay empty.
auto src_col1 = type->createColumn();
auto & src_col_object1 = assert_cast<ColumnObject &>(*src_col1);
src_col_object1.insert(Object{{"b.d", Field(43u)}, {"a.c", Field("Str1")}});
src_col_object1.insert(Object{{"a.b", Field(Array{"Str1", "Str2"})}, {"a.a", Field("Str1")}});
src_col_object1.insert(Object{{"b.d", Field(45u)}, {"a.c", Field("Str2")}});
col_object.insertRangeFrom(src_col_object1, 0, 3);
ASSERT_EQ((*typed_paths.at("a.b"))[1], Field(Array{}));
ASSERT_EQ((*typed_paths.at("a.b"))[2], Field(Array{"Str1", "Str2"}));
ASSERT_EQ((*typed_paths.at("a.b"))[3], Field(Array{}));
ASSERT_EQ((*typed_paths.at("b.d"))[1], Field(43u));
ASSERT_EQ((*typed_paths.at("b.d"))[2], Field(0u));
ASSERT_EQ((*typed_paths.at("b.d"))[3], Field(45u));
ASSERT_EQ(dynamic_paths.size(), 2);
ASSERT_EQ((*dynamic_paths.at("a.a"))[1], Field(Null()));
ASSERT_EQ((*dynamic_paths.at("a.a"))[2], Field("Str1"));
ASSERT_EQ((*dynamic_paths.at("a.a"))[3], Field(Null()));
ASSERT_EQ((*dynamic_paths.at("a.c"))[1], Field("Str1"));
ASSERT_EQ((*dynamic_paths.at("a.c"))[2], Field(Null()));
ASSERT_EQ((*dynamic_paths.at("a.c"))[3], Field("Str2"));
ASSERT_TRUE(shared_data_nested_column.isDefaultAt(1));
ASSERT_TRUE(shared_data_nested_column.isDefaultAt(2));
ASSERT_TRUE(shared_data_nested_column.isDefaultAt(3));
/// Source 2 (destination rows 4-6): "a.d" and "a.e" are not dynamic paths of the destination,
/// so each row is expected to contribute two shared data entries.
auto src_col2 = type->createColumn();
auto & src_col_object2 = assert_cast<ColumnObject &>(*src_col2);
src_col_object2.insert(Object{{"a.b", Array{"Str4", "Str5"}}, {"a.d", Field("Str2")}, {"a.e", Field("Str3")}});
src_col_object2.insert(Object{{"b.d", Field(44u)}, {"a.d", Field("Str22")}, {"a.e", Field("Str33")}});
src_col_object2.insert(Object{{"a.b", Array{"Str44", "Str55"}}, {"a.d", Field("Str222")}, {"a.e", Field("Str333")}});
col_object.insertRangeFrom(src_col_object2, 0, 3);
ASSERT_EQ((*typed_paths.at("a.b"))[4], Field(Array{"Str4", "Str5"}));
ASSERT_EQ((*typed_paths.at("a.b"))[5], Field(Array{}));
ASSERT_EQ((*typed_paths.at("a.b"))[6], Field(Array{"Str44", "Str55"}));
ASSERT_EQ((*typed_paths.at("b.d"))[4], Field(0u));
ASSERT_EQ((*typed_paths.at("b.d"))[5], Field(44u));
ASSERT_EQ((*typed_paths.at("b.d"))[6], Field(0u));
ASSERT_EQ(dynamic_paths.size(), 2);
ASSERT_EQ((*dynamic_paths.at("a.a"))[4], Field(Null()));
ASSERT_EQ((*dynamic_paths.at("a.a"))[5], Field(Null()));
ASSERT_EQ((*dynamic_paths.at("a.a"))[6], Field(Null()));
ASSERT_EQ((*dynamic_paths.at("a.c"))[4], Field(Null()));
ASSERT_EQ((*dynamic_paths.at("a.c"))[5], Field(Null()));
ASSERT_EQ((*dynamic_paths.at("a.c"))[6], Field(Null()));
ASSERT_EQ(shared_data_offsets[4] - shared_data_offsets[3], 2);
ASSERT_EQ((*shared_data_paths)[0], "a.d");
ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 0), Field("Str2"));
ASSERT_EQ((*shared_data_paths)[1], "a.e");
ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 1), Field("Str3"));
ASSERT_EQ(shared_data_offsets[5] - shared_data_offsets[4], 2);
ASSERT_EQ((*shared_data_paths)[2], "a.d");
ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 2), Field("Str22"));
ASSERT_EQ((*shared_data_paths)[3], "a.e");
ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 3), Field("Str33"));
ASSERT_EQ(shared_data_offsets[6] - shared_data_offsets[5], 2);
ASSERT_EQ((*shared_data_paths)[4], "a.d");
ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 4), Field("Str222"));
ASSERT_EQ((*shared_data_paths)[5], "a.e");
ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 5), Field("Str333"));
/// Source 3: rows 1..3 of the source are copied into destination rows 7-9 (source row 0 is skipped).
/// Values for "a.a" and "a.c" are expected in the destination's dynamic paths, everything else in shared data.
auto src_col3 = type->createColumn();
auto & src_col_object3 = assert_cast<ColumnObject &>(*src_col3);
src_col_object3.insert(Object{{"a.h", Field("Str6")}, {"h.h", Field("Str7")}});
src_col_object3.insert(Object{{"a.h", Field("Str6")}, {"h.h", Field("Str7")}, {"a.f", Field("Str8")}, {"a.g", Field("Str9")}, {"a.i", Field("Str11")}});
src_col_object3.insert(Object{{"a.a", Field("Str10")}});
src_col_object3.insert(Object{{"a.h", Field("Str6")}, {"a.c", Field(45u)}, {"h.h", Field("Str7")}, {"a.i", Field("Str11")}});
col_object.insertRangeFrom(src_col_object3, 1, 3);
ASSERT_EQ((*typed_paths.at("a.b"))[7], Field(Array{}));
ASSERT_EQ((*typed_paths.at("a.b"))[8], Field(Array{}));
ASSERT_EQ((*typed_paths.at("a.b"))[9], Field(Array{}));
ASSERT_EQ((*typed_paths.at("b.d"))[7], Field(0u));
ASSERT_EQ((*typed_paths.at("b.d"))[8], Field(0u));
ASSERT_EQ((*typed_paths.at("b.d"))[9], Field(0u));
ASSERT_EQ(dynamic_paths.size(), 2);
ASSERT_EQ((*dynamic_paths.at("a.a"))[7], Field(Null()));
ASSERT_EQ((*dynamic_paths.at("a.a"))[8], Field("Str10"));
ASSERT_EQ((*dynamic_paths.at("a.a"))[9], Field(Null()));
ASSERT_EQ((*dynamic_paths.at("a.c"))[7], Field(Null()));
ASSERT_EQ((*dynamic_paths.at("a.c"))[8], Field(Null()));
ASSERT_EQ((*dynamic_paths.at("a.c"))[9], Field(45u));
/// Row 7: five shared data entries, expected ordered by path.
ASSERT_EQ(shared_data_offsets[7] - shared_data_offsets[6], 5);
ASSERT_EQ((*shared_data_paths)[6], "a.f");
ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 6), Field("Str8"));
ASSERT_EQ((*shared_data_paths)[7], "a.g");
ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 7), Field("Str9"));
ASSERT_EQ((*shared_data_paths)[8], "a.h");
ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 8), Field("Str6"));
ASSERT_EQ((*shared_data_paths)[9], "a.i");
ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 9), Field("Str11"));
ASSERT_EQ((*shared_data_paths)[10], "h.h");
ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 10), Field("Str7"));
/// Row 8 has no shared data; row 9 has three entries, of which the first two are checked here.
ASSERT_EQ(shared_data_offsets[8] - shared_data_offsets[7], 0);
ASSERT_EQ(shared_data_offsets[9] - shared_data_offsets[8], 3);
ASSERT_EQ((*shared_data_paths)[11], "a.h");
ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 11), Field("Str6"));
ASSERT_EQ((*shared_data_paths)[12], "a.i");
ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 12), Field("Str11"));
}
/// Serializes three rows into an Arena with serializeValueIntoArena and restores them
/// with deserializeAndInsertFromArena into a separate, initially empty column.
/// The restored rows must be equal to the rows that were inserted into the source column.
TEST(ColumnObject, SerializeDeserializerFromArena)
{
    auto type = DataTypeFactory::instance().get("JSON(max_dynamic_types=10, max_dynamic_paths=2, b.d UInt32, a.b Array(String))");
    auto col = type->createColumn();
    auto & col_object = assert_cast<ColumnObject &>(*col);
    col_object.insert(Object{{"b.d", Field(42u)}, {"a.b", Array{"Str1", "Str2"}}, {"a.a", Tuple{"Str3", 441u}}, {"a.c", Field("Str4")}, {"a.d", Array{Field(45), Field(46)}}, {"a.e", Field(47)}});
    col_object.insert(Object{{"b.a", Field(48)}, {"b.b", Array{Field(49), Field(50)}}});
    col_object.insert(Object{{"b.d", Field(442u)}, {"a.b", Array{"Str11", "Str22"}}, {"a.a", Tuple{"Str33", 444u}}, {"a.c", Field("Str44")}, {"a.d", Array{Field(445), Field(446)}}, {"a.e", Field(447)}});

    /// All three rows are serialized with the same `pos`; the deserialization below relies on
    /// each call returning the position where the next serialized row starts.
    Arena arena;
    const char * pos = nullptr;
    auto ref1 = col_object.serializeValueIntoArena(0, arena, pos);
    col_object.serializeValueIntoArena(1, arena, pos);
    col_object.serializeValueIntoArena(2, arena, pos);

    /// Deserialize into the fresh column `col2`. Binding the reference to `*col` instead would
    /// append the rows back to the source column, and the checks below would then compare
    /// the source rows with themselves without testing deserialization at all.
    auto col2 = type->createColumn();
    auto & col_object2 = assert_cast<ColumnObject &>(*col2);
    pos = col_object2.deserializeAndInsertFromArena(ref1.data);
    pos = col_object2.deserializeAndInsertFromArena(pos);
    col_object2.deserializeAndInsertFromArena(pos);

    ASSERT_EQ(col_object2.size(), 3);
    ASSERT_EQ(col_object2[0], (Object{{"b.d", Field(42u)}, {"a.b", Array{"Str1", "Str2"}}, {"a.a", Tuple{"Str3", 441u}}, {"a.c", Field("Str4")}, {"a.d", Array{Field(45), Field(46)}}, {"a.e", Field(47)}}));
    ASSERT_EQ(col_object2[1], (Object{{"b.d", Field{0u}}, {"a.b", Array{}}, {"b.a", Field(48)}, {"b.b", Array{Field(49), Field(50)}}}));
    ASSERT_EQ(col_object2[2], (Object{{"b.d", Field(442u)}, {"a.b", Array{"Str11", "Str22"}}, {"a.a", Tuple{"Str33", 444u}}, {"a.c", Field("Str44")}, {"a.d", Array{Field(445), Field(446)}}, {"a.e", Field(447)}}));
}
/// Serializes three rows of a JSON column into an arena and verifies that three consecutive
/// skipSerializedInArena() calls, started at the first row, stop exactly at the end of the last row.
TEST(ColumnObject, SkipSerializedInArena)
{
    auto json_type = DataTypeFactory::instance().get("JSON(max_dynamic_types=10, max_dynamic_paths=2, b.d UInt32, a.b Array(String))");
    auto source = json_type->createColumn();
    auto & source_object = assert_cast<ColumnObject &>(*source);
    source_object.insert(Object{{"b.d", Field(42u)}, {"a.b", Array{"Str1", "Str2"}}, {"a.a", Tuple{"Str3", 441u}}, {"a.c", Field("Str4")}, {"a.d", Array{Field(45), Field(46)}}, {"a.e", Field(47)}});
    source_object.insert(Object{{"b.a", Field(48)}, {"b.b", Array{Field(49), Field(50)}}});
    source_object.insert(Object{{"b.d", Field(442u)}, {"a.b", Array{"Str11", "Str22"}}, {"a.a", Tuple{"Str33", 444u}}, {"a.c", Field("Str44")}, {"a.d", Array{Field(445), Field(446)}}, {"a.e", Field(447)}});

    /// Only the first and the last serialized rows are needed afterwards:
    /// the first one gives the start position, the last one gives the expected end position.
    Arena arena;
    const char * arena_begin = nullptr;
    auto first_row = source_object.serializeValueIntoArena(0, arena, arena_begin);
    source_object.serializeValueIntoArena(1, arena, arena_begin);
    auto last_row = source_object.serializeValueIntoArena(2, arena, arena_begin);
    const char * expected_end = last_row.data + last_row.size;

    /// Skipping is done through a separate, empty column of the same type.
    auto skipper = json_type->createColumn();
    const char * cursor = first_row.data;
    for (size_t row = 0; row != 3; ++row)
        cursor = skipper->skipSerializedInArena(cursor);

    ASSERT_EQ(cursor, expected_end);
}

View File

@ -0,0 +1,30 @@
#pragma once
#include <base/StringRef.h>
namespace DB
{
/// Hash functor for string-keyed unordered containers that accepts std::string_view, std::string
/// and C strings alike. Every overload hashes through std::hash<std::string_view>, so the same
/// character sequence produces the same hash regardless of the argument type.
/// See https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2018/p0919r3.html
struct StringHashForHeterogeneousLookup
{
    using hash_type = std::hash<std::string_view>;
    using transparent_key_equal = std::equal_to<>;
    using is_transparent = void; // required to make find() work with different type than key_type

    /// The single place where hashing actually happens.
    auto operator()(const std::string_view view) const { return hash_type{}(view); }

    /// A std::string is hashed through a view over its characters.
    auto operator()(const std::string & str) const { return (*this)(std::string_view{str}); }

    /// A C string is hashed through a view up to its terminating zero byte.
    auto operator()(const char * data) const { return (*this)(std::string_view{data}); }
};
}

View File

@ -878,6 +878,7 @@ class IColumn;
M(Bool, allow_get_client_http_header, false, "Allow to use the function `getClientHTTPHeader` which lets to obtain a value of an the current HTTP request's header. It is not enabled by default for security reasons, because some headers, such as `Cookie`, could contain sensitive info. Note that the `X-ClickHouse-*` and `Authentication` headers are always restricted and cannot be obtained with this function.", 0) \ M(Bool, allow_get_client_http_header, false, "Allow to use the function `getClientHTTPHeader` which lets to obtain a value of an the current HTTP request's header. It is not enabled by default for security reasons, because some headers, such as `Cookie`, could contain sensitive info. Note that the `X-ClickHouse-*` and `Authentication` headers are always restricted and cannot be obtained with this function.", 0) \
M(Bool, cast_string_to_dynamic_use_inference, false, "Use types inference during String to Dynamic conversion", 0) \ M(Bool, cast_string_to_dynamic_use_inference, false, "Use types inference during String to Dynamic conversion", 0) \
M(Bool, enable_blob_storage_log, true, "Write information about blob storage operations to system.blob_storage_log table", 0) \ M(Bool, enable_blob_storage_log, true, "Write information about blob storage operations to system.blob_storage_log table", 0) \
M(Bool, use_json_alias_for_old_object_type, false, "When enabled, JSON type alias will create old experimental Object type instead of a new JSON type", 0) \
M(Bool, allow_create_index_without_type, false, "Allow CREATE INDEX query without TYPE. Query will be ignored. Made for SQL compatibility tests.", 0) \ M(Bool, allow_create_index_without_type, false, "Allow CREATE INDEX query without TYPE. Query will be ignored. Made for SQL compatibility tests.", 0) \
M(Bool, create_index_ignore_unique, false, "Ignore UNIQUE keyword in CREATE UNIQUE INDEX. Made for SQL compatibility tests.", 0) \ M(Bool, create_index_ignore_unique, false, "Ignore UNIQUE keyword in CREATE UNIQUE INDEX. Made for SQL compatibility tests.", 0) \
M(Bool, print_pretty_type_names, true, "Print pretty type names in DESCRIBE query and toTypeName() function", 0) \ M(Bool, print_pretty_type_names, true, "Print pretty type names in DESCRIBE query and toTypeName() function", 0) \
@ -911,6 +912,7 @@ class IColumn;
M(Bool, allow_experimental_vector_similarity_index, false, "Allow experimental vector similarity index", 0) \ M(Bool, allow_experimental_vector_similarity_index, false, "Allow experimental vector similarity index", 0) \
M(Bool, allow_experimental_variant_type, false, "Allow Variant data type", 0) \ M(Bool, allow_experimental_variant_type, false, "Allow Variant data type", 0) \
M(Bool, allow_experimental_dynamic_type, false, "Allow Dynamic data type", 0) \ M(Bool, allow_experimental_dynamic_type, false, "Allow Dynamic data type", 0) \
M(Bool, allow_experimental_json_type, false, "Allow JSON data type", 0) \
M(Bool, allow_experimental_codecs, false, "If it is set to true, allow to specify experimental compression codecs (but we don't have those yet and this option does nothing).", 0) \ M(Bool, allow_experimental_codecs, false, "If it is set to true, allow to specify experimental compression codecs (but we don't have those yet and this option does nothing).", 0) \
M(UInt64, max_limit_for_ann_queries, 1'000'000, "SELECT queries with LIMIT bigger than this setting cannot use ANN indexes. Helps to prevent memory overflows in ANN search indexes.", 0) \ M(UInt64, max_limit_for_ann_queries, 1'000'000, "SELECT queries with LIMIT bigger than this setting cannot use ANN indexes. Helps to prevent memory overflows in ANN search indexes.", 0) \
M(Bool, throw_on_unsupported_query_inside_transaction, true, "Throw exception if unsupported query is used inside transaction", 0) \ M(Bool, throw_on_unsupported_query_inside_transaction, true, "Throw exception if unsupported query is used inside transaction", 0) \
@ -1132,6 +1134,7 @@ class IColumn;
M(Bool, input_format_json_defaults_for_missing_elements_in_named_tuple, true, "Insert default value in named tuple element if it's missing in json object", 0) \ M(Bool, input_format_json_defaults_for_missing_elements_in_named_tuple, true, "Insert default value in named tuple element if it's missing in json object", 0) \
M(Bool, input_format_json_throw_on_bad_escape_sequence, true, "Throw an exception if JSON string contains bad escape sequence in JSON input formats. If disabled, bad escape sequences will remain as is in the data", 0) \ M(Bool, input_format_json_throw_on_bad_escape_sequence, true, "Throw an exception if JSON string contains bad escape sequence in JSON input formats. If disabled, bad escape sequences will remain as is in the data", 0) \
M(Bool, input_format_json_ignore_unnecessary_fields, true, "Ignore unnecessary fields and not parse them. Enabling this may not throw exceptions on json strings of invalid format or with duplicated fields", 0) \ M(Bool, input_format_json_ignore_unnecessary_fields, true, "Ignore unnecessary fields and not parse them. Enabling this may not throw exceptions on json strings of invalid format or with duplicated fields", 0) \
M(Bool, type_json_skip_duplicated_paths, false, "When enabled, during parsing JSON object into JSON type duplicated paths will be ignored and only the first one will be inserted instead of an exception", 0) \
M(UInt64, input_format_json_max_depth, 1000, "Maximum depth of a field in JSON. This is not a strict limit, it does not have to be applied precisely.", 0) \ M(UInt64, input_format_json_max_depth, 1000, "Maximum depth of a field in JSON. This is not a strict limit, it does not have to be applied precisely.", 0) \
M(Bool, input_format_try_infer_integers, true, "Try to infer integers instead of floats while schema inference in text formats", 0) \ M(Bool, input_format_try_infer_integers, true, "Try to infer integers instead of floats while schema inference in text formats", 0) \
M(Bool, input_format_try_infer_dates, true, "Try to infer dates from string fields while schema inference in text formats", 0) \ M(Bool, input_format_try_infer_dates, true, "Try to infer dates from string fields while schema inference in text formats", 0) \

View File

@ -87,6 +87,9 @@ static std::initializer_list<std::pair<ClickHouseVersion, SettingsChangesHistory
{"allow_experimental_time_series_table", false, false, "Added new setting to allow the TimeSeries table engine"}, {"allow_experimental_time_series_table", false, false, "Added new setting to allow the TimeSeries table engine"},
{"enable_analyzer", 1, 1, "Added an alias to a setting `allow_experimental_analyzer`."}, {"enable_analyzer", 1, 1, "Added an alias to a setting `allow_experimental_analyzer`."},
{"optimize_functions_to_subcolumns", false, true, "Enabled settings by default"}, {"optimize_functions_to_subcolumns", false, true, "Enabled settings by default"},
{"allow_experimental_json_type", false, false, "Add new experimental JSON type"},
{"use_json_alias_for_old_object_type", true, false, "Use JSON type alias to create new JSON type"},
{"type_json_skip_duplicated_paths", false, false, "Allow to skip duplicated paths during JSON parsing"},
{"allow_experimental_vector_similarity_index", false, false, "Added new setting to allow experimental vector similarity indexes"}, {"allow_experimental_vector_similarity_index", false, false, "Added new setting to allow experimental vector similarity indexes"},
{"input_format_try_infer_datetimes_only_datetime64", true, false, "Allow to infer DateTime instead of DateTime64 in data formats"} {"input_format_try_infer_datetimes_only_datetime64", true, false, "Allow to infer DateTime instead of DateTime64 in data formats"}
} }

View File

@ -45,6 +45,7 @@ enum class TypeIndex : uint8_t
AggregateFunction, AggregateFunction,
LowCardinality, LowCardinality,
Map, Map,
ObjectDeprecated,
Object, Object,
IPv4, IPv4,
IPv6, IPv6,

View File

@ -15,6 +15,7 @@
#include <Parsers/ASTFunction.h> #include <Parsers/ASTFunction.h>
#include <Parsers/ASTIdentifier.h> #include <Parsers/ASTIdentifier.h>
#include <Parsers/ASTLiteral.h> #include <Parsers/ASTLiteral.h>
#include <base/find_symbols.h>
#include <IO/ReadBufferFromMemory.h> #include <IO/ReadBufferFromMemory.h>
namespace DB namespace DB
@ -67,7 +68,11 @@ static DataTypePtr create(const ASTPtr & arguments)
if (!argument || argument->name != "equals") if (!argument || argument->name != "equals")
throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, "Dynamic data type argument should be in a form 'max_types=N'"); throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, "Dynamic data type argument should be in a form 'max_types=N'");
auto identifier_name = argument->arguments->children[0]->as<ASTIdentifier>()->name(); const auto * identifier = argument->arguments->children[0]->as<ASTIdentifier>();
/// `identifier` is null in this branch and must not be dereferenced:
/// describe the offending argument through the AST node the cast was applied to.
if (!identifier)
    throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, "Unexpected Dynamic type argument: {}. Expected expression 'max_types=N'", argument->arguments->children[0]->formatForErrorMessage());
auto identifier_name = identifier->name();
if (identifier_name != "max_types") if (identifier_name != "max_types")
throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, "Unexpected identifier: {}. Dynamic data type argument should be in a form 'max_types=N'", identifier_name); throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, "Unexpected identifier: {}. Dynamic data type argument should be in a form 'max_types=N'", identifier_name);
@ -84,9 +89,53 @@ void registerDataTypeDynamic(DataTypeFactory & factory)
factory.registerDataType("Dynamic", create); factory.registerDataType("Dynamic", create);
} }
namespace
{
/// Split Dynamic subcolumn name into 2 parts: type name and subcolumn of this type.
/// We cannot simply split by '.' because type name can also contain dots. For example: Tuple(`a.b` UInt32).
/// But in all such cases this '.' will be inside back quotes. To split subcolumn name correctly
/// we search for the first '.' that is not inside back quotes.
/// A backslash escapes the character that follows it, so an escaped back quote inside a quoted name
/// neither opens nor closes a quoted section.
/// Returns {type name, nested subcolumn name}; the second part is empty when there is no separating dot.
std::pair<std::string_view, std::string_view> splitSubcolumnName(std::string_view subcolumn_name)
{
    bool inside_quotes = false;
    const char * pos = subcolumn_name.data();
    const char * end = subcolumn_name.data() + subcolumn_name.size();
    while (true)
    {
        pos = find_first_symbols<'`', '.', '\\'>(pos, end);
        if (pos == end)
            break;

        if (*pos == '`')
        {
            inside_quotes = !inside_quotes;
            ++pos;
        }
        else if (*pos == '\\')
        {
            /// Step over the backslash and over the escaped character (if there is one).
            /// Stepping over the backslash alone would let the next iteration treat an escaped
            /// back quote as a real one and flip `inside_quotes`.
            ++pos;
            if (pos != end)
                ++pos;
        }
        else if (*pos == '.')
        {
            /// A dot inside back quotes belongs to the type name, a dot outside is the separator.
            if (inside_quotes)
                ++pos;
            else
                break;
        }
    }

    if (pos == end)
        return {subcolumn_name, {}};

    /// `pos` points to the separating dot: everything before it is the type name, everything after it is the nested subcolumn.
    return {std::string_view(subcolumn_name.data(), pos), std::string_view(pos + 1, end)};
}
}
std::unique_ptr<IDataType::SubstreamData> DataTypeDynamic::getDynamicSubcolumnData(std::string_view subcolumn_name, const DB::IDataType::SubstreamData & data, bool throw_if_null) const std::unique_ptr<IDataType::SubstreamData> DataTypeDynamic::getDynamicSubcolumnData(std::string_view subcolumn_name, const DB::IDataType::SubstreamData & data, bool throw_if_null) const
{ {
auto [type_subcolumn_name, subcolumn_nested_name] = Nested::splitName(subcolumn_name); auto [type_subcolumn_name, subcolumn_nested_name] = splitSubcolumnName(subcolumn_name);
/// Check if requested subcolumn is a valid data type. /// Check if requested subcolumn is a valid data type.
auto subcolumn_type = DataTypeFactory::instance().tryGet(String(type_subcolumn_name)); auto subcolumn_type = DataTypeFactory::instance().tryGet(String(type_subcolumn_name));
if (!subcolumn_type) if (!subcolumn_type)

View File

@ -12,6 +12,9 @@ class DataTypeDynamic final : public IDataType
public: public:
static constexpr bool is_parametric = true; static constexpr bool is_parametric = true;
/// Don't change this constant, it can break backward compatibility.
static constexpr size_t DEFAULT_MAX_DYNAMIC_TYPES = 32;
explicit DataTypeDynamic(size_t max_dynamic_types_ = DEFAULT_MAX_DYNAMIC_TYPES); explicit DataTypeDynamic(size_t max_dynamic_types_ = DEFAULT_MAX_DYNAMIC_TYPES);
TypeIndex getTypeId() const override { return TypeIndex::Dynamic; } TypeIndex getTypeId() const override { return TypeIndex::Dynamic; }
@ -43,8 +46,6 @@ public:
size_t getMaxDynamicTypes() const { return max_dynamic_types; } size_t getMaxDynamicTypes() const { return max_dynamic_types; }
private: private:
static constexpr size_t DEFAULT_MAX_DYNAMIC_TYPES = 32;
SerializationPtr doGetDefaultSerialization() const override; SerializationPtr doGetDefaultSerialization() const override;
String doGetName() const override; String doGetName() const override;

View File

@ -273,9 +273,10 @@ DataTypeFactory::DataTypeFactory()
registerDataTypeDomainSimpleAggregateFunction(*this); registerDataTypeDomainSimpleAggregateFunction(*this);
registerDataTypeDomainGeo(*this); registerDataTypeDomainGeo(*this);
registerDataTypeMap(*this); registerDataTypeMap(*this);
registerDataTypeObject(*this); registerDataTypeObjectDeprecated(*this);
registerDataTypeVariant(*this); registerDataTypeVariant(*this);
registerDataTypeDynamic(*this); registerDataTypeDynamic(*this);
registerDataTypeJSON(*this);
} }
DataTypeFactory & DataTypeFactory::instance() DataTypeFactory & DataTypeFactory::instance()

View File

@ -99,8 +99,9 @@ void registerDataTypeLowCardinality(DataTypeFactory & factory);
void registerDataTypeDomainBool(DataTypeFactory & factory); void registerDataTypeDomainBool(DataTypeFactory & factory);
void registerDataTypeDomainSimpleAggregateFunction(DataTypeFactory & factory); void registerDataTypeDomainSimpleAggregateFunction(DataTypeFactory & factory);
void registerDataTypeDomainGeo(DataTypeFactory & factory); void registerDataTypeDomainGeo(DataTypeFactory & factory);
void registerDataTypeObject(DataTypeFactory & factory); void registerDataTypeObjectDeprecated(DataTypeFactory & factory);
void registerDataTypeVariant(DataTypeFactory & factory); void registerDataTypeVariant(DataTypeFactory & factory);
void registerDataTypeDynamic(DataTypeFactory & factory); void registerDataTypeDynamic(DataTypeFactory & factory);
void registerDataTypeJSON(DataTypeFactory & factory);
} }

View File

@ -1,83 +1,511 @@
#include <DataTypes/DataTypeObject.h>
#include <DataTypes/DataTypeFactory.h> #include <DataTypes/DataTypeFactory.h>
#include <DataTypes/Serializations/SerializationObject.h> #include <DataTypes/DataTypeObject.h>
#include <DataTypes/Serializations/SerializationJSON.h>
#include <DataTypes/Serializations/SerializationObjectTypedPath.h>
#include <DataTypes/Serializations/SerializationObjectDynamicPath.h>
#include <DataTypes/Serializations/SerializationSubObject.h>
#include <Columns/ColumnObject.h>
#include <Parsers/IAST.h> #include <Parsers/IAST.h>
#include <Parsers/ASTLiteral.h> #include <Parsers/ASTLiteral.h>
#include <Parsers/ASTDataType.h> #include <Parsers/ASTDataType.h>
#include <Parsers/ASTFunction.h>
#include <Parsers/ASTIdentifier.h>
#include <Parsers/ASTObjectTypeArgument.h>
#include <Parsers/ASTNameTypePair.h>
#include <Formats/JSONExtractTree.h>
#include <Interpreters/Context.h>
#include <Core/Settings.h>
#include <IO/Operators.h> #include <IO/Operators.h>
#if USE_SIMDJSON
#include <Common/JSONParsers/SimdJSONParser.h>
#endif
#if USE_RAPIDJSON
#include <Common/JSONParsers/RapidJSONParser.h>
#endif
#include <Common/JSONParsers/DummyJSONParser.h>
namespace DB namespace DB
{ {
namespace ErrorCodes namespace ErrorCodes
{ {
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int UNEXPECTED_AST_STRUCTURE; extern const int UNEXPECTED_AST_STRUCTURE;
extern const int BAD_ARGUMENTS;
} }
/// Full constructor: stores the schema format, the typed paths, the SKIP paths, the SKIP REGEXPs
/// and the limits on dynamic paths and dynamic types.
/// Throws BAD_ARGUMENTS if a typed path starts with one of the SKIP paths or is fully matched by one
/// of the SKIP REGEXPs.
DataTypeObject::DataTypeObject(const String & schema_format_, bool is_nullable_) DataTypeObject::DataTypeObject(
: schema_format(Poco::toLower(schema_format_)) const SchemaFormat & schema_format_,
, is_nullable(is_nullable_) std::unordered_map<String, DataTypePtr> typed_paths_,
std::unordered_set<String> paths_to_skip_,
std::vector<String> path_regexps_to_skip_,
size_t max_dynamic_paths_,
size_t max_dynamic_types_)
: schema_format(schema_format_)
, typed_paths(std::move(typed_paths_))
, paths_to_skip(std::move(paths_to_skip_))
, path_regexps_to_skip(std::move(path_regexps_to_skip_))
, max_dynamic_paths(max_dynamic_paths_)
, max_dynamic_types(max_dynamic_types_)
{
/// Validate every typed path against every SKIP rule.
for (const auto & [typed_path, type] : typed_paths)
{
for (const auto & path_to_skip : paths_to_skip)
{
/// Plain string-prefix comparison.
if (typed_path.starts_with(path_to_skip))
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Path '{}' is specified with the data type ('{}') and matches the SKIP path prefix '{}'", typed_path, type->getName(), path_to_skip);
}
for (const auto & path_regex_to_skip : path_regexps_to_skip)
{
/// The regexp has to match the whole typed path; a RE2 object is constructed for each check.
if (re2::RE2::FullMatch(typed_path, re2::RE2(path_regex_to_skip)))
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Path '{}' is specified with the data type ('{}') and matches the SKIP REGEXP '{}'", typed_path, type->getName(), path_regex_to_skip);
}
}
}
/// Short constructor: only the schema format and the two limits are set.
/// Typed paths, SKIP paths and SKIP REGEXPs are not mentioned in the initializer list,
/// so there is nothing to validate here.
DataTypeObject::DataTypeObject(const DB::DataTypeObject::SchemaFormat & schema_format_, size_t max_dynamic_paths_, size_t max_dynamic_types_)
: schema_format(schema_format_)
, max_dynamic_paths(max_dynamic_paths_)
, max_dynamic_types(max_dynamic_types_)
{ {
} }
/// Two object types are equal when `rhs` is a DataTypeObject with the same set of typed paths
/// (each path mapped to an equal type), the same schema format, the same SKIP paths and SKIP REGEXPs
/// and the same max_dynamic_types / max_dynamic_paths. Any other `rhs` type gives false.
bool DataTypeObject::equals(const IDataType & rhs) const bool DataTypeObject::equals(const IDataType & rhs) const
{ {
if (const auto * object = typeid_cast<const DataTypeObject *>(&rhs)) if (const auto * object = typeid_cast<const DataTypeObject *>(&rhs))
return schema_format == object->schema_format && is_nullable == object->is_nullable; {
/// Equal sizes plus "every path of ours is present in the other map" means the key sets are identical.
if (typed_paths.size() != object->typed_paths.size())
return false;
for (const auto & [path, type] : typed_paths)
{
auto it = object->typed_paths.find(path);
if (it == object->typed_paths.end())
return false;
if (!type->equals(*it->second))
return false;
}
/// path_regexps_to_skip is declared as a vector in the constructor, so this comparison is order-sensitive.
return schema_format == object->schema_format && paths_to_skip == object->paths_to_skip && path_regexps_to_skip == object->path_regexps_to_skip
&& max_dynamic_types == object->max_dynamic_types && max_dynamic_paths == object->max_dynamic_paths;
}
return false; return false;
} }
SerializationPtr DataTypeObject::doGetDefaultSerialization() const SerializationPtr DataTypeObject::doGetDefaultSerialization() const
{ {
return getObjectSerialization(schema_format); std::unordered_map<String, SerializationPtr> typed_path_serializations;
typed_path_serializations.reserve(typed_paths.size());
for (const auto & [path, type] : typed_paths)
typed_path_serializations[path] = type->getDefaultSerialization();
switch (schema_format)
{
    case SchemaFormat::JSON:
/// The JSON parser is selected at compile time: simdjson first, then rapidjson, then the dummy parser.
/// The macros are tested by value (`#if`), exactly like the guards around the parser includes at the
/// top of this file. `#ifdef` would also take a branch for a macro that is defined as 0, i.e. it would
/// reference a parser whose header was not included.
#if USE_SIMDJSON
        return std::make_shared<SerializationJSON<SimdJSONParser>>(
            std::move(typed_path_serializations),
            paths_to_skip,
            path_regexps_to_skip,
            buildJSONExtractTree<SimdJSONParser>(getPtr(), "JSON serialization"));
#elif USE_RAPIDJSON
        return std::make_shared<SerializationJSON<RapidJSONParser>>(
            std::move(typed_path_serializations),
            paths_to_skip,
            path_regexps_to_skip,
            buildJSONExtractTree<RapidJSONParser>(getPtr(), "JSON serialization"));
#else
        return std::make_shared<SerializationJSON<DummyJSONParser>>(
            std::move(typed_path_serializations),
            paths_to_skip,
            path_regexps_to_skip,
            buildJSONExtractTree<DummyJSONParser>(getPtr(), "JSON serialization"));
#endif
}
} }
/// Builds the textual name of the type: the schema format name followed by an optional
/// parenthesized argument list. Arguments are written in this order: max_dynamic_types,
/// max_dynamic_paths, typed paths, SKIP paths, SKIP REGEXPs. When no argument is written,
/// no parentheses are written either.
String DataTypeObject::doGetName() const String DataTypeObject::doGetName() const
{ {
WriteBufferFromOwnString out; WriteBufferFromOwnString out;
if (is_nullable) out << magic_enum::enum_name(schema_format);
out << "Object(Nullable(" << quote << schema_format << "))"; bool first = true;
else auto write_separator = [&]()
out << "Object(" << quote << schema_format << ")"; {
/// The first argument opens the parenthesis, every following one is preceded by ", ".
if (!first)
{
out << ", ";
}
else
{
out << "(";
first = false;
}
};
/// The limits are printed only when they differ from the defaults.
if (max_dynamic_types != DataTypeDynamic::DEFAULT_MAX_DYNAMIC_TYPES)
{
write_separator();
out << "max_dynamic_types=" << max_dynamic_types;
}
if (max_dynamic_paths != DEFAULT_MAX_SEPARATELY_STORED_PATHS)
{
write_separator();
out << "max_dynamic_paths=" << max_dynamic_paths;
}
/// typed_paths is an unordered map, so the paths are sorted to print them in a fixed order.
std::vector<String> sorted_typed_paths;
sorted_typed_paths.reserve(typed_paths.size());
for (const auto & [path, _] : typed_paths)
sorted_typed_paths.push_back(path);
std::sort(sorted_typed_paths.begin(), sorted_typed_paths.end());
for (const auto & path : sorted_typed_paths)
{
write_separator();
out << backQuoteIfNeed(path) << " " << typed_paths.at(path)->getName();
}
/// The SKIP paths are sorted before printing as well.
std::vector<String> sorted_skip_paths;
sorted_skip_paths.reserve(paths_to_skip.size());
for (const auto & skip_path : paths_to_skip)
sorted_skip_paths.push_back(skip_path);
std::sort(sorted_skip_paths.begin(), sorted_skip_paths.end());
for (const auto & skip_path : sorted_skip_paths)
{
write_separator();
out << "SKIP " << backQuoteIfNeed(skip_path);
}
/// The SKIP REGEXPs are printed in their stored order, without sorting.
for (const auto & skip_regexp : path_regexps_to_skip)
{
write_separator();
out << "SKIP REGEXP " << quoteString(skip_regexp);
}
/// Close the parenthesis only if write_separator() opened one.
if (!first)
out << ")";
return out.str(); return out.str();
} }
/// createColumn(): creates one column per typed path (through the createColumn() of the path's type)
/// and passes them, together with max_dynamic_paths and max_dynamic_types, to ColumnObject::create().
static DataTypePtr create(const ASTPtr & arguments) MutableColumnPtr DataTypeObject::createColumn() const
{ {
if (!arguments || arguments->children.size() != 1) std::unordered_map<String, MutableColumnPtr> typed_path_columns;
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, typed_path_columns.reserve(typed_paths.size());
"Object data type family must have one argument - name of schema format"); for (const auto & [path, type] : typed_paths)
typed_path_columns[path] = type->createColumn();
ASTPtr schema_argument = arguments->children[0]; return ColumnObject::create(std::move(typed_path_columns), max_dynamic_paths, max_dynamic_types);
bool is_nullable = false; }
if (const auto * type = schema_argument->as<ASTDataType>()) namespace
{
/// It is possible to have nested JSON object inside Dynamic. For example when we have an array of JSON objects.
/// During type inference in parsing in case of creating nested JSON objects, we reduce max_dynamic_paths/max_dynamic_types by factors
/// NESTED_OBJECT_MAX_DYNAMIC_PATHS_REDUCE_FACTOR/NESTED_OBJECT_MAX_DYNAMIC_TYPES_REDUCE_FACTOR.
/// So the type name will actually be JSON(max_dynamic_paths=N, max_dynamic_types=M). But we want the user to be able to query it
/// using json.array.:`Array(JSON)`.some.path without specifying max_dynamic_paths/max_dynamic_types.
/// To support it, we do a trick - we replace JSON name in subcolumn to JSON(max_dynamic_paths=N, max_dynamic_types=M), because we know
/// the exact values of max_dynamic_paths/max_dynamic_types for it.
///
/// `type_name` is modified in place. `max_dynamic_paths` / `max_dynamic_types` are the limits of the
/// enclosing type; the values written into the name are these limits divided by the reduce factors.
/// NOTE(review): the search is a plain substring search for "JSON", so the same four characters inside
/// any other identifier of the type name would be matched too - confirm this cannot happen in practice.
void replaceJSONTypeNameIfNeeded(String & type_name, size_t max_dynamic_paths, size_t max_dynamic_types)
{
auto pos = type_name.find("JSON");
while (pos != String::npos)
{ {
if (type->name != "Nullable" || type->arguments->children.size() != 1) /// Replace only if we don't already have parameters in JSON type declaration.
throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, if (pos + 4 == type_name.size() || type_name[pos + 4] != '(')
"Expected 'Nullable(<schema_name>)' as parameter for type Object (function: {})", type->name); type_name.replace(
pos,
4,
fmt::format(
"JSON(max_dynamic_paths={}, max_dynamic_types={})",
max_dynamic_paths / DataTypeObject::NESTED_OBJECT_MAX_DYNAMIC_PATHS_REDUCE_FACTOR,
max_dynamic_types / DataTypeObject::NESTED_OBJECT_MAX_DYNAMIC_TYPES_REDUCE_FACTOR));
/// Continue right after the "JSON" token found at `pos`.
pos = type_name.find("JSON", pos + 4);
}
}
schema_argument = type->arguments->children[0]; /// JSON subcolumn name with Dynamic type subcolumn looks like this:
is_nullable = true; /// "json.some.path.:`Type_name`.some.subcolumn".
/// We back quoted type name during identifier parsing so we can distinguish type subcolumn and path element ":TypeName".
std::pair<String, String> splitPathAndDynamicTypeSubcolumn(std::string_view subcolumn_name, size_t max_dynamic_paths, size_t max_dynamic_types)
{
/// Try to find dynamic type subcolumn in a form .:`Type`.
auto pos = subcolumn_name.find(".:`");
if (pos == std::string_view::npos)
return {String(subcolumn_name), ""};
ReadBufferFromMemory buf(subcolumn_name.substr(pos + 2));
String dynamic_subcolumn;
/// Try to read back quoted type name.
if (!tryReadBackQuotedString(dynamic_subcolumn, buf))
return {String(subcolumn_name), ""};
replaceJSONTypeNameIfNeeded(dynamic_subcolumn, max_dynamic_paths, max_dynamic_types);
/// If there is more data in the buffer - it's subcolumn of a type, append it to the type name.
if (!buf.eof())
dynamic_subcolumn += String(buf.position(), buf.available());
return {String(subcolumn_name.substr(0, pos)), dynamic_subcolumn};
}
/// Sub-object subcolumn in JSON path always looks like "^`some`.path.path".
/// We back quote first path element after `^` so we can distinguish sub-object subcolumn and path element "^path".
/// Returns the path with its first element unquoted and the remaining elements appended as they are,
/// or std::nullopt when the name does not start with "^`" or the back-quoted element cannot be read.
std::optional<String> tryGetSubObjectSubcolumn(std::string_view subcolumn_name)
{
    if (!subcolumn_name.starts_with("^`"))
        return std::nullopt;

    /// Drop the leading '^' but keep working with a view, so that the buffer is bounded by the size of
    /// `subcolumn_name`. A bare `data() + 1` pointer carries no length, and a string_view is not
    /// guaranteed to be zero-terminated.
    ReadBufferFromMemory buf(subcolumn_name.substr(1));
    String path;
    /// Try to read back-quoted first path element.
    if (!tryReadBackQuotedString(path, buf))
        return std::nullopt;

    /// Add remaining path elements if any.
    return path + String(buf.position(), buf.available());
}
/// Strips `prefix` and the single character that follows it from the beginning of `path`.
/// For example, for prefix a.b:
/// a.b.c.d -> c.d, a.b.c -> c
/// The function does not check that `path` really starts with `prefix`: only the prefix length is used.
/// substr() throws std::out_of_range when `path` is shorter than prefix.size() + 1.
String getSubPath(const String & path, const String & prefix)
{
    const size_t sub_path_begin = prefix.size() + 1;
    return path.substr(sub_path_begin);
}

/// Same as above for a view; the result refers to the characters of `path` and makes no copy.
std::string_view getSubPath(std::string_view path, const String & prefix)
{
    const size_t sub_path_begin = prefix.size() + 1;
    return path.substr(sub_path_begin);
}
}
std::unique_ptr<ISerialization::SubstreamData> DataTypeObject::getDynamicSubcolumnData(std::string_view subcolumn_name, const SubstreamData & data, bool throw_if_null) const
{
/// Check if it's sub-object subcolumn.
/// In this case we should return JSON column with all paths that are inside specified object prefix.
/// For example, if we have {"a" : {"b" : {"c" : {"d" : 10, "e" : "Hello"}, "f" : [1, 2, 3]}}} and subcolumn ^a.b
/// we should return JSON column with data {"c" : {"d" : 10, "e" : Hello}, "f" : [1, 2, 3]}
if (auto sub_object_subcolumn = tryGetSubObjectSubcolumn(subcolumn_name))
{
const String & prefix = *sub_object_subcolumn;
/// Collect new typed paths.
std::unordered_map<String, DataTypePtr> typed_sub_paths;
/// Collect serializations for typed paths. They will be needed for sub-object subcolumn deserialization.
std::unordered_map<String, SerializationPtr> typed_paths_serializations;
for (const auto & [path, type] : typed_paths)
{
if (path.starts_with(prefix) && path.size() != prefix.size())
{
typed_sub_paths[getSubPath(path, prefix)] = type;
typed_paths_serializations[path] = type->getDefaultSerialization();
}
}
std::unique_ptr<SubstreamData> res = std::make_unique<SubstreamData>(std::make_shared<SerializationSubObject>(prefix, typed_paths_serializations));
/// Keep all current constraints like limits and skip paths/prefixes/regexps.
res->type = std::make_shared<DataTypeObject>(schema_format, typed_sub_paths, paths_to_skip, path_regexps_to_skip, max_dynamic_paths, max_dynamic_types);
/// If column was provided, we should create a column for the requested subcolumn.
if (data.column)
{
const auto & object_column = assert_cast<const ColumnObject &>(*data.column);
auto result_column = res->type->createColumn();
auto & result_object_column = assert_cast<ColumnObject &>(*result_column);
/// Iterate over all typed/dynamic/shared data paths and collect all paths with specified prefix.
auto & result_typed_columns = result_object_column.getTypedPaths();
for (const auto & [path, column] : object_column.getTypedPaths())
{
if (path.starts_with(prefix) && path.size() != prefix.size())
result_typed_columns[getSubPath(path, prefix)] = column;
}
auto & result_dynamic_columns = result_object_column.getDynamicPaths();
auto & result_dynamic_columns_ptrs = result_object_column.getDynamicPathsPtrs();
for (const auto & [path, column] : object_column.getDynamicPaths())
{
if (path.starts_with(prefix) && path.size() != prefix.size())
{
auto sub_path = getSubPath(path, prefix);
result_dynamic_columns[sub_path] = column;
result_dynamic_columns_ptrs[sub_path] = assert_cast<ColumnDynamic *>(result_dynamic_columns[sub_path].get());
}
}
const auto & shared_data_offsets = object_column.getSharedDataOffsets();
const auto [shared_data_paths, shared_data_values] = object_column.getSharedDataPathsAndValues();
auto & result_shared_data_offsets = result_object_column.getSharedDataOffsets();
result_shared_data_offsets.reserve(shared_data_offsets.size());
auto [result_shared_data_paths, result_shared_data_values] = result_object_column.getSharedDataPathsAndValues();
for (size_t i = 0; i != shared_data_offsets.size(); ++i)
{
size_t start = shared_data_offsets[ssize_t(i) - 1];
size_t end = shared_data_offsets[ssize_t(i)];
size_t lower_bound_index = ColumnObject::findPathLowerBoundInSharedData(prefix, *shared_data_paths, start, end);
for (; lower_bound_index != end; ++lower_bound_index)
{
auto path = shared_data_paths->getDataAt(lower_bound_index).toView();
if (!path.starts_with(prefix))
break;
/// Don't include path that is equal to the prefix.
if (path.size() != prefix.size())
{
auto sub_path = getSubPath(path, prefix);
result_shared_data_paths->insertData(sub_path.data(), sub_path.size());
result_shared_data_values->insertFrom(*shared_data_values, lower_bound_index);
}
}
result_shared_data_offsets.push_back(result_shared_data_paths->size());
}
res->column = std::move(result_column);
}
return res;
} }
const auto * literal = schema_argument->as<ASTLiteral>(); /// Split requested subcolumn to the JSON path and Dynamic type subcolumn.
if (!literal || literal->value.getType() != Field::Types::String) auto [path, path_subcolumn] = splitPathAndDynamicTypeSubcolumn(subcolumn_name, max_dynamic_paths, max_dynamic_types);
throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, std::unique_ptr<SubstreamData> res;
"Object data type family must have a const string as its schema name parameter"); if (auto it = typed_paths.find(path); it != typed_paths.end())
{
res = std::make_unique<SubstreamData>(it->second->getDefaultSerialization());
res->type = it->second;
}
else
{
res = std::make_unique<SubstreamData>(std::make_shared<SerializationDynamic>());
res->type = std::make_shared<DataTypeDynamic>();
}
return std::make_shared<DataTypeObject>(literal->value.safeGet<const String &>(), is_nullable); /// If column was provided, we should create a column for requested subcolumn.
if (data.column)
{
const auto & object_column = assert_cast<const ColumnObject &>(*data.column);
/// Try to find requested path in typed paths.
if (auto typed_it = object_column.getTypedPaths().find(path); typed_it != object_column.getTypedPaths().end())
{
res->column = typed_it->second;
}
/// Try to find requested path in dynamic paths.
else if (auto dynamic_it = object_column.getDynamicPaths().find(path); dynamic_it != object_column.getDynamicPaths().end())
{
res->column = dynamic_it->second;
}
/// Extract values of requested path from shared data.
else
{
auto dynamic_column = ColumnDynamic::create(max_dynamic_types);
dynamic_column->reserve(object_column.size());
ColumnObject::fillPathColumnFromSharedData(*dynamic_column, path, object_column.getSharedDataPtr(), 0, object_column.size());
res->column = std::move(dynamic_column);
}
}
/// Get subcolumn for Dynamic type if needed.
if (!path_subcolumn.empty())
{
res = res->type->getSubcolumnData(path_subcolumn, *res, throw_if_null);
if (!res)
return nullptr;
}
if (typed_paths.contains(path))
res->serialization = std::make_shared<SerializationObjectTypedPath>(res->serialization, path);
else
res->serialization = std::make_shared<SerializationObjectDynamicPath>(res->serialization, path, path_subcolumn, max_dynamic_types);
return res;
} }
void registerDataTypeObject(DataTypeFactory & factory) static DataTypePtr createObject(const ASTPtr & arguments, const DataTypeObject::SchemaFormat & schema_format)
{ {
factory.registerDataType("Object", create); if (!arguments || arguments->children.empty())
factory.registerSimpleDataType("JSON", return std::make_shared<DataTypeObject>(schema_format);
[] { return std::make_shared<DataTypeObject>("JSON", false); },
DataTypeFactory::Case::Insensitive); std::unordered_map<String, DataTypePtr> typed_paths;
std::unordered_set<String> paths_to_skip;
std::vector<String> path_regexps_to_skip;
size_t max_dynamic_types = DataTypeDynamic::DEFAULT_MAX_DYNAMIC_TYPES;
size_t max_dynamic_paths = DataTypeObject::DEFAULT_MAX_SEPARATELY_STORED_PATHS;
for (const auto & argument : arguments->children)
{
const auto * object_type_argument = argument->as<ASTObjectTypeArgument>();
if (object_type_argument->parameter)
{
const auto * function = object_type_argument->parameter->as<ASTFunction>();
if (!function || function->name != "equals")
throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, "Unexpected parameter in {} type arguments: {}", magic_enum::enum_name(schema_format), function->formatForErrorMessage());
const auto * identifier = function->arguments->children[0]->as<ASTIdentifier>();
if (!identifier)
throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, "Unexpected {} type argument: {}. Expected expression 'max_dynamic_types=N' or 'max_dynamic_paths=N'", magic_enum::enum_name(schema_format), function->formatForErrorMessage());
auto identifier_name = identifier->name();
if (identifier_name != "max_dynamic_types" && identifier_name != "max_dynamic_paths")
throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, "Unexpected parameter in {} type arguments: {}. Expected 'max_dynamic_types' or `max_dynamic_paths`", magic_enum::enum_name(schema_format), identifier_name);
auto * literal = function->arguments->children[1]->as<ASTLiteral>();
/// Is 1000000 a good maximum for max paths?
size_t max_value = identifier_name == "max_dynamic_types" ? ColumnDynamic::MAX_DYNAMIC_TYPES_LIMIT : 1000000;
if (!literal || literal->value.getType() != Field::Types::UInt64 || literal->value.safeGet<UInt64>() > max_value)
throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, "'{}' parameter for {} type should be a positive integer between 0 and {}. Got {}", identifier_name, magic_enum::enum_name(schema_format), max_value, function->arguments->children[1]->formatForErrorMessage());
if (identifier_name == "max_dynamic_types")
max_dynamic_types = literal->value.safeGet<UInt64>();
else
max_dynamic_paths = literal->value.safeGet<UInt64>();
}
else if (object_type_argument->path_with_type)
{
const auto * path_with_type = object_type_argument->path_with_type->as<ASTNameTypePair>();
auto data_type = DataTypeFactory::instance().get(path_with_type->type);
if (typed_paths.contains(path_with_type->name))
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Found duplicated path with type: {}", path_with_type->name);
typed_paths.emplace(path_with_type->name, data_type);
}
else if (object_type_argument->skip_path)
{
const auto * identifier = object_type_argument->skip_path->as<ASTIdentifier>();
if (!identifier)
throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, "Unexpected AST in SKIP section of {} type arguments: {}. Expected identifier with path name", magic_enum::enum_name(schema_format), object_type_argument->skip_path->formatForErrorMessage());
paths_to_skip.insert(identifier->name());
}
else if (object_type_argument->skip_path_regexp)
{
const auto * literal = object_type_argument->skip_path_regexp->as<ASTLiteral>();
if (!literal || literal->value.getType() != Field::Types::String)
throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, "Unexpected AST in SKIP section of {} type arguments: {}. Expected identifier with path name", magic_enum::enum_name(schema_format), object_type_argument->skip_path->formatForErrorMessage());
path_regexps_to_skip.push_back(literal->value.safeGet<String>());
}
}
std::sort(path_regexps_to_skip.begin(), path_regexps_to_skip.end());
return std::make_shared<DataTypeObject>(schema_format, std::move(typed_paths), std::move(paths_to_skip), std::move(path_regexps_to_skip), max_dynamic_paths, max_dynamic_types);
}
/// Factory callback for the `JSON` type name: delegates to createObject() with the JSON schema format.
static DataTypePtr createJSON(const ASTPtr & arguments)
{
    constexpr auto json_format = DataTypeObject::SchemaFormat::JSON;
    return createObject(arguments, json_format);
}
/// Registers the case-insensitive `JSON` type name for the new Object implementation.
/// Nothing is registered when the global setting `use_json_alias_for_old_object_type` is enabled.
void registerDataTypeJSON(DataTypeFactory & factory)
{
    const auto global_context = Context::getGlobalContextInstance();
    if (global_context->getSettingsRef().use_json_alias_for_old_object_type)
        return;

    factory.registerDataType("JSON", createJSON, DataTypeFactory::Case::Insensitive);
}
} }

View File

@ -1,48 +1,80 @@
#pragma once #pragma once
#include <DataTypes/IDataType.h> #include <DataTypes/IDataType.h>
#include <DataTypes/DataTypeDynamic.h>
#include <Core/Field.h> #include <Core/Field.h>
#include <Columns/ColumnObject.h> #include <Columns/ColumnObjectDeprecated.h>
#include <Common/re2.h>
namespace DB namespace DB
{ {
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
}
class DataTypeObject : public IDataType class DataTypeObject : public IDataType
{ {
private:
String schema_format;
bool is_nullable;
public: public:
DataTypeObject(const String & schema_format_, bool is_nullable_); enum class SchemaFormat
{
JSON = 0,
};
/// Don't change these constants, it can break backward compatibility.
static constexpr size_t DEFAULT_MAX_SEPARATELY_STORED_PATHS = 1024;
static constexpr size_t NESTED_OBJECT_MAX_DYNAMIC_PATHS_REDUCE_FACTOR = 4;
static constexpr size_t NESTED_OBJECT_MAX_DYNAMIC_TYPES_REDUCE_FACTOR = 2;
explicit DataTypeObject(
const SchemaFormat & schema_format_,
std::unordered_map<String, DataTypePtr> typed_paths_ = {},
std::unordered_set<String> paths_to_skip_ = {},
std::vector<String> path_regexps_to_skip_ = {},
size_t max_dynamic_paths_ = DEFAULT_MAX_SEPARATELY_STORED_PATHS,
size_t max_dynamic_types_ = DataTypeDynamic::DEFAULT_MAX_DYNAMIC_TYPES);
DataTypeObject(const SchemaFormat & schema_format_, size_t max_dynamic_paths_, size_t max_dynamic_types_);
const char * getFamilyName() const override { return "Object"; } const char * getFamilyName() const override { return "Object"; }
String doGetName() const override; String doGetName() const override;
TypeIndex getTypeId() const override { return TypeIndex::Object; } TypeIndex getTypeId() const override { return TypeIndex::Object; }
MutableColumnPtr createColumn() const override { return ColumnObject::create(is_nullable); } MutableColumnPtr createColumn() const override;
Field getDefault() const override Field getDefault() const override { return Object(); }
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getDefault() is not implemented for data type {}", getName());
}
bool haveSubtypes() const override { return false; }
bool equals(const IDataType & rhs) const override;
bool isParametric() const override { return true; } bool isParametric() const override { return true; }
bool hasDynamicSubcolumnsDeprecated() const override { return true; } bool canBeInsideNullable() const override { return false; }
bool supportsSparseSerialization() const override { return false; }
bool canBeInsideSparseColumns() const override { return false; }
bool isComparable() const override { return false; }
bool haveSubtypes() const override { return false; }
bool equals(const IDataType & rhs) const override;
bool hasDynamicSubcolumnsData() const override { return true; }
std::unique_ptr<SubstreamData> getDynamicSubcolumnData(std::string_view subcolumn_name, const SubstreamData & data, bool throw_if_null) const override;
SerializationPtr doGetDefaultSerialization() const override; SerializationPtr doGetDefaultSerialization() const override;
bool hasNullableSubcolumns() const { return is_nullable; } const SchemaFormat & getSchemaFormat() const { return schema_format; }
const std::unordered_map<String, DataTypePtr> & getTypedPaths() const { return typed_paths; }
const std::unordered_set<String> & getPathsToSkip() const { return paths_to_skip; }
const std::vector<String> & getPathRegexpsToSkip() const { return path_regexps_to_skip; }
const String & getSchemaFormat() const { return schema_format; } size_t getMaxDynamicTypes() const { return max_dynamic_types; }
size_t getMaxDynamicPaths() const { return max_dynamic_paths; }
private:
SchemaFormat schema_format;
/// Set of paths with types that were specified in type declaration.
std::unordered_map<String, DataTypePtr> typed_paths;
/// Set of paths that should be skipped during data parsing.
std::unordered_set<String> paths_to_skip;
/// List of regular expressions that should be used to skip paths during data parsing.
std::vector<String> path_regexps_to_skip;
/// Limit on the number of paths that can be stored as subcolumn.
size_t max_dynamic_paths;
/// Limit of dynamic types that should be used for Dynamic columns.
size_t max_dynamic_types;
}; };
} }

View File

@ -0,0 +1,87 @@
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/DataTypeObjectDeprecated.h>
#include <DataTypes/Serializations/SerializationObjectDeprecated.h>
#include <Parsers/IAST.h>
#include <Parsers/ASTLiteral.h>
#include <Parsers/ASTFunction.h>
#include <Parsers/ASTDataType.h>
#include <IO/Operators.h>
#include <Interpreters/Context.h>
#include <Core/Settings.h>
namespace DB
{
namespace ErrorCodes
{
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int UNEXPECTED_AST_STRUCTURE;
}
DataTypeObjectDeprecated::DataTypeObjectDeprecated(const String & schema_format_, bool is_nullable_)
: schema_format(Poco::toLower(schema_format_))
, is_nullable(is_nullable_)
{
}
/// Two deprecated Object types are equal when both the schema format and the nullability flag match.
/// Any other data type compares unequal.
bool DataTypeObjectDeprecated::equals(const IDataType & rhs) const
{
    const auto * other = typeid_cast<const DataTypeObjectDeprecated *>(&rhs);
    if (!other)
        return false;

    return schema_format == other->schema_format && is_nullable == other->is_nullable;
}
/// The default serialization is obtained from getObjectSerialization() for the stored schema format.
SerializationPtr DataTypeObjectDeprecated::doGetDefaultSerialization() const
{
    SerializationPtr serialization = getObjectSerialization(schema_format);
    return serialization;
}
/// Produces `Object(<quoted schema format>)`, or `Object(Nullable(<quoted schema format>))`
/// when the type was declared with nullable subcolumns.
String DataTypeObjectDeprecated::doGetName() const
{
    WriteBufferFromOwnString name_buf;

    name_buf << "Object(";
    if (is_nullable)
        name_buf << "Nullable(";

    name_buf << quote << schema_format << ")";

    /// Close the extra `Nullable(` wrapper opened above.
    if (is_nullable)
        name_buf << ")";

    return name_buf.str();
}
/// Factory callback for the deprecated `Object(...)` type family.
///
/// Accepts exactly one argument: either a string literal with the schema format name,
/// or `Nullable(<string literal>)`, which additionally marks the type as having nullable subcolumns.
///
/// Throws NUMBER_OF_ARGUMENTS_DOESNT_MATCH when the argument count is not one and
/// UNEXPECTED_AST_STRUCTURE when the argument has any other shape.
static DataTypePtr create(const ASTPtr & arguments)
{
    if (!arguments || arguments->children.size() != 1)
        throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
            "Object data type family must have one argument - name of schema format");

    ASTPtr schema_argument = arguments->children[0];
    bool is_nullable = false;

    if (const auto * type = schema_argument->as<ASTDataType>())
    {
        /// `type->arguments` is checked before use so a missing argument list is reported as a
        /// malformed declaration instead of dereferencing a null pointer.
        /// NOTE(review): presumably it is null when `Nullable` is written without parentheses - confirm against the parser.
        if (type->name != "Nullable" || !type->arguments || type->arguments->children.size() != 1)
            throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE,
                "Expected 'Nullable(<schema_name>)' as parameter for type Object (function: {})", type->name);

        /// Unwrap `Nullable(...)` and continue with its only argument as the schema name.
        schema_argument = type->arguments->children[0];
        is_nullable = true;
    }

    const auto * literal = schema_argument->as<ASTLiteral>();
    if (!literal || literal->value.getType() != Field::Types::String)
        throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE,
            "Object data type family must have a const string as its schema name parameter");

    return std::make_shared<DataTypeObjectDeprecated>(literal->value.safeGet<const String &>(), is_nullable);
}
/// Registers the deprecated `Object` type family.
/// When the global setting `use_json_alias_for_old_object_type` is enabled, the case-insensitive
/// `JSON` name is additionally registered as a non-nullable deprecated Object with the "JSON" schema format.
void registerDataTypeObjectDeprecated(DataTypeFactory & factory)
{
    factory.registerDataType("Object", create);

    const auto global_context = Context::getGlobalContextInstance();
    if (!global_context->getSettingsRef().use_json_alias_for_old_object_type)
        return;

    auto make_legacy_json = [] { return std::make_shared<DataTypeObjectDeprecated>("JSON", false); };
    factory.registerSimpleDataType("JSON", make_legacy_json, DataTypeFactory::Case::Insensitive);
}
}

View File

@ -0,0 +1,48 @@
#pragma once
#include <DataTypes/IDataType.h>
#include <Core/Field.h>
#include <Columns/ColumnObjectDeprecated.h>
namespace DB
{
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
}
/// Data type of the deprecated `Object(<schema format>)` family.
/// It is parameterized by a schema format name and by whether its subcolumns are nullable.
class DataTypeObjectDeprecated : public IDataType
{
public:
    DataTypeObjectDeprecated(const String & schema_format_, bool is_nullable_);

    /// Identification of the type.
    const char * getFamilyName() const override { return "Object"; }
    TypeIndex getTypeId() const override { return TypeIndex::ObjectDeprecated; }
    String doGetName() const override;

    /// Columns of this type are ColumnObjectDeprecated created with the same nullability flag.
    MutableColumnPtr createColumn() const override { return ColumnObjectDeprecated::create(is_nullable); }

    SerializationPtr doGetDefaultSerialization() const override;

    /// There is no default value for this type: the call always throws NOT_IMPLEMENTED.
    Field getDefault() const override
    {
        throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getDefault() is not implemented for data type {}", getName());
    }

    bool equals(const IDataType & rhs) const override;

    /// Fixed properties of the type.
    bool isParametric() const override { return true; }
    bool haveSubtypes() const override { return false; }
    bool hasDynamicSubcolumnsDeprecated() const override { return true; }

    /// Accessors for the parameters given at construction.
    const String & getSchemaFormat() const { return schema_format; }
    bool hasNullableSubcolumns() const { return is_nullable; }

private:
    /// Schema format name as stored by the constructor.
    String schema_format;
    /// Whether the type was declared with nullable subcolumns.
    bool is_nullable;
};
}

View File

@ -11,7 +11,7 @@
#include <DataTypes/Serializations/SerializationTuple.h> #include <DataTypes/Serializations/SerializationTuple.h>
#include <DataTypes/Serializations/SerializationNamed.h> #include <DataTypes/Serializations/SerializationNamed.h>
#include <DataTypes/Serializations/SerializationInfoTuple.h> #include <DataTypes/Serializations/SerializationInfoTuple.h>
#include <DataTypes/Serializations/SerializationVariantElement.h> #include <DataTypes/Serializations/SerializationWrapper.h>
#include <DataTypes/NestedUtils.h> #include <DataTypes/NestedUtils.h>
#include <Parsers/IAST.h> #include <Parsers/IAST.h>
#include <Parsers/ASTNameTypePair.h> #include <Parsers/ASTNameTypePair.h>

View File

@ -9,6 +9,7 @@
#include <DataTypes/DataTypeFunction.h> #include <DataTypes/DataTypeFunction.h>
#include <DataTypes/DataTypeLowCardinality.h> #include <DataTypes/DataTypeLowCardinality.h>
#include <DataTypes/DataTypeMap.h> #include <DataTypes/DataTypeMap.h>
#include <DataTypes/DataTypeObject.h>
#include <DataTypes/DataTypeVariant.h> #include <DataTypes/DataTypeVariant.h>
#include <DataTypes/DataTypeString.h> #include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeUUID.h> #include <DataTypes/DataTypeUUID.h>
@ -94,8 +95,13 @@ enum class BinaryTypeIndex : uint8_t
Bool = 0x2D, Bool = 0x2D,
SimpleAggregateFunction = 0x2E, SimpleAggregateFunction = 0x2E,
Nested = 0x2F, Nested = 0x2F,
JSON = 0x30,
}; };
/// In future we can introduce more arguments in the JSON data type definition.
/// To support such changes, use versioning in the serialization of JSON type.
const UInt8 TYPE_JSON_SERIALIZATION_VERSION = 0;
BinaryTypeIndex getBinaryTypeIndex(const DataTypePtr & type) BinaryTypeIndex getBinaryTypeIndex(const DataTypePtr & type)
{ {
/// By default custom types don't have their own BinaryTypeIndex. /// By default custom types don't have their own BinaryTypeIndex.
@ -202,7 +208,7 @@ BinaryTypeIndex getBinaryTypeIndex(const DataTypePtr & type)
return BinaryTypeIndex::LowCardinality; return BinaryTypeIndex::LowCardinality;
case TypeIndex::Map: case TypeIndex::Map:
return BinaryTypeIndex::Map; return BinaryTypeIndex::Map;
case TypeIndex::Object: case TypeIndex::ObjectDeprecated:
/// Object type will be deprecated and replaced by new implementation. No need to support it here. /// Object type will be deprecated and replaced by new implementation. No need to support it here.
throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Binary encoding of type Object is not supported"); throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Binary encoding of type Object is not supported");
case TypeIndex::IPv4: case TypeIndex::IPv4:
@ -216,6 +222,15 @@ BinaryTypeIndex getBinaryTypeIndex(const DataTypePtr & type)
/// JSONPaths is used only during schema inference and cannot be used anywhere else. /// JSONPaths is used only during schema inference and cannot be used anywhere else.
case TypeIndex::JSONPaths: case TypeIndex::JSONPaths:
throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Binary encoding of type JSONPaths is not supported"); throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Binary encoding of type JSONPaths is not supported");
case TypeIndex::Object:
{
const auto & object_type = assert_cast<const DataTypeObject &>(*type);
switch (object_type.getSchemaFormat())
{
case DataTypeObject::SchemaFormat::JSON:
return BinaryTypeIndex::JSON;
}
}
} }
} }
@ -480,6 +495,30 @@ void encodeDataType(const DataTypePtr & type, WriteBuffer & buf)
writeStringBinary(type_name, buf); writeStringBinary(type_name, buf);
break; break;
} }
case BinaryTypeIndex::JSON:
{
const auto & object_type = assert_cast<const DataTypeObject &>(*type);
/// Write version of the serialization because we can add new arguments in the JSON type.
writeBinary(TYPE_JSON_SERIALIZATION_VERSION, buf);
writeVarUInt(object_type.getMaxDynamicPaths(), buf);
writeBinary(UInt8(object_type.getMaxDynamicTypes()), buf);
const auto & typed_paths = object_type.getTypedPaths();
writeVarUInt(typed_paths.size(), buf);
for (const auto & [path, path_type] : typed_paths)
{
writeStringBinary(path, buf);
encodeDataType(path_type, buf);
}
const auto & paths_to_skip = object_type.getPathsToSkip();
writeVarUInt(paths_to_skip.size(), buf);
for (const auto & path : paths_to_skip)
writeStringBinary(path, buf);
const auto & path_regexps_to_skip = object_type.getPathRegexpsToSkip();
writeVarUInt(path_regexps_to_skip.size(), buf);
for (const auto & regexp : path_regexps_to_skip)
writeStringBinary(regexp, buf);
break;
}
default: default:
break; break;
} }
@ -691,6 +730,54 @@ DataTypePtr decodeDataType(ReadBuffer & buf)
readStringBinary(type_name, buf); readStringBinary(type_name, buf);
return DataTypeFactory::instance().get(type_name); return DataTypeFactory::instance().get(type_name);
} }
case BinaryTypeIndex::JSON:
{
UInt8 serialization_version;
readBinary(serialization_version, buf);
if (serialization_version > TYPE_JSON_SERIALIZATION_VERSION)
throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected version of JSON type binary encoding");
size_t max_dynamic_paths;
readVarUInt(max_dynamic_paths, buf);
UInt8 max_dynamic_types;
readBinary(max_dynamic_types, buf);
size_t typed_paths_size;
readVarUInt(typed_paths_size, buf);
std::unordered_map<String, DataTypePtr> typed_paths;
for (size_t i = 0; i != typed_paths_size; ++i)
{
String path;
readStringBinary(path, buf);
typed_paths[path] = decodeDataType(buf);
}
size_t paths_to_skip_size;
readVarUInt(paths_to_skip_size, buf);
std::unordered_set<String> paths_to_skip;
paths_to_skip.reserve(paths_to_skip_size);
for (size_t i = 0; i != paths_to_skip_size; ++i)
{
String path;
readStringBinary(path, buf);
paths_to_skip.insert(path);
}
size_t path_regexps_to_skip_size;
readVarUInt(path_regexps_to_skip_size, buf);
std::vector<String> path_regexps_to_skip;
path_regexps_to_skip.reserve(path_regexps_to_skip_size);
for (size_t i = 0; i != path_regexps_to_skip_size; ++i)
{
String regexp;
readStringBinary(regexp, buf);
path_regexps_to_skip.push_back(regexp);
}
return std::make_shared<DataTypeObject>(
DataTypeObject::SchemaFormat::JSON,
typed_paths,
paths_to_skip,
path_regexps_to_skip,
max_dynamic_paths,
max_dynamic_types);
}
} }
throw Exception(ErrorCodes::INCORRECT_DATA, "Unknown type code: {0:#04x}", UInt64(type)); throw Exception(ErrorCodes::INCORRECT_DATA, "Unknown type code: {0:#04x}", UInt64(type));

View File

@ -8,58 +8,59 @@ namespace DB
/** /**
Binary encoding for ClickHouse data types: Binary encoding for ClickHouse data types:
|------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| |---------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| ClickHouse data type | Binary encoding | | ClickHouse data type | Binary encoding |
|------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| |---------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| Nothing | 0x00 | | Nothing | 0x00 |
| UInt8 | 0x01 | | UInt8 | 0x01 |
| UInt16 | 0x02 | | UInt16 | 0x02 |
| UInt32 | 0x03 | | UInt32 | 0x03 |
| UInt64 | 0x04 | | UInt64 | 0x04 |
| UInt128 | 0x05 | | UInt128 | 0x05 |
| UInt256 | 0x06 | | UInt256 | 0x06 |
| Int8 | 0x07 | | Int8 | 0x07 |
| Int16 | 0x08 | | Int16 | 0x08 |
| Int32 | 0x09 | | Int32 | 0x09 |
| Int64 | 0x0A | | Int64 | 0x0A |
| Int128 | 0x0B | | Int128 | 0x0B |
| Int256 | 0x0C | | Int256 | 0x0C |
| Float32 | 0x0D | | Float32 | 0x0D |
| Float64 | 0x0E | | Float64 | 0x0E |
| Date | 0x0F | | Date | 0x0F |
| Date32 | 0x10 | | Date32 | 0x10 |
| DateTime | 0x11 | | DateTime | 0x11 |
| DateTime(time_zone) | 0x12<var_uint_time_zone_name_size><time_zone_name_data> | | DateTime(time_zone) | 0x12<var_uint_time_zone_name_size><time_zone_name_data> |
| DateTime64(P) | 0x13<uint8_precision> | | DateTime64(P) | 0x13<uint8_precision> |
| DateTime64(P, time_zone) | 0x14<uint8_precision><var_uint_time_zone_name_size><time_zone_name_data> | | DateTime64(P, time_zone) | 0x14<uint8_precision><var_uint_time_zone_name_size><time_zone_name_data> |
| String | 0x15 | | String | 0x15 |
| FixedString(N) | 0x16<var_uint_size> | | FixedString(N) | 0x16<var_uint_size> |
| Enum8 | 0x17<var_uint_number_of_elements><var_uint_name_size_1><name_data_1><int8_value_1>...<var_uint_name_size_N><name_data_N><int8_value_N> | | Enum8 | 0x17<var_uint_number_of_elements><var_uint_name_size_1><name_data_1><int8_value_1>...<var_uint_name_size_N><name_data_N><int8_value_N> |
| Enum16 | 0x18<var_uint_number_of_elements><var_uint_name_size_1><name_data_1><int16_little_endian_value_1>...><var_uint_name_size_N><name_data_N><int16_little_endian_value_N> | | Enum16 | 0x18<var_uint_number_of_elements><var_uint_name_size_1><name_data_1><int16_little_endian_value_1>...><var_uint_name_size_N><name_data_N><int16_little_endian_value_N> |
| Decimal32(P, S) | 0x19<uint8_precision><uint8_scale> | | Decimal32(P, S) | 0x19<uint8_precision><uint8_scale> |
| Decimal64(P, S) | 0x1A<uint8_precision><uint8_scale> | | Decimal64(P, S) | 0x1A<uint8_precision><uint8_scale> |
| Decimal128(P, S) | 0x1B<uint8_precision><uint8_scale> | | Decimal128(P, S) | 0x1B<uint8_precision><uint8_scale> |
| Decimal256(P, S) | 0x1C<uint8_precision><uint8_scale> | | Decimal256(P, S) | 0x1C<uint8_precision><uint8_scale> |
| UUID | 0x1D | | UUID | 0x1D |
| Array(T) | 0x1E<nested_type_encoding> | | Array(T) | 0x1E<nested_type_encoding> |
| Tuple(T1, ..., TN) | 0x1F<var_uint_number_of_elements><nested_type_encoding_1>...<nested_type_encoding_N> | | Tuple(T1, ..., TN) | 0x1F<var_uint_number_of_elements><nested_type_encoding_1>...<nested_type_encoding_N> |
| Tuple(name1 T1, ..., nameN TN) | 0x20<var_uint_number_of_elements><var_uint_name_size_1><name_data_1><nested_type_encoding_1>...<var_uint_name_size_N><name_data_N><nested_type_encoding_N> | | Tuple(name1 T1, ..., nameN TN) | 0x20<var_uint_number_of_elements><var_uint_name_size_1><name_data_1><nested_type_encoding_1>...<var_uint_name_size_N><name_data_N><nested_type_encoding_N> |
| Set | 0x21 | | Set| 0x21 |
| Interval | 0x22<interval_kind> | | Interval | 0x22<interval_kind> |
| Nullable(T) | 0x23<nested_type_encoding> | | Nullable(T) | 0x23<nested_type_encoding> |
| Function | 0x24<var_uint_number_of_arguments><argument_type_encoding_1>...<argument_type_encoding_N><return_type_encoding> | | Function | 0x24<var_uint_number_of_arguments><argument_type_encoding_1>...<argument_type_encoding_N><return_type_encoding> |
| AggregateFunction(function_name(param_1, ..., param_N), arg_T1, ..., arg_TN) | 0x25<var_uint_version><var_uint_function_name_size><function_name_data><var_uint_number_of_parameters><param_1>...<param_N><var_uint_number_of_arguments><argument_type_encoding_1>...<argument_type_encoding_N> | | AggregateFunction(function_name(param_1, ..., param_N), arg_T1, ..., arg_TN) | 0x25<var_uint_version><var_uint_function_name_size><function_name_data><var_uint_number_of_parameters><param_1>...<param_N><var_uint_number_of_arguments><argument_type_encoding_1>...<argument_type_encoding_N> |
| LowCardinality(T) | 0x26<nested_type_encoding> | | LowCardinality(T) | 0x26<nested_type_encoding> |
| Map(K, V) | 0x27<key_type_encoding><value_type_encoding> | | Map(K, V) | 0x27<key_type_encoding><value_type_encoding> |
| IPv4 | 0x28 | | IPv4 | 0x28 |
| IPv6 | 0x29 | | IPv6 | 0x29 |
| Variant(T1, ..., TN) | 0x2A<var_uint_number_of_variants><variant_type_encoding_1>...<variant_type_encoding_N> | | Variant(T1, ..., TN) | 0x2A<var_uint_number_of_variants><variant_type_encoding_1>...<variant_type_encoding_N> |
| Dynamic(max_types=N) | 0x2B<uint8_max_types> | | Dynamic(max_types=N) | 0x2B<uint8_max_types> |
| Custom type (Ring, Polygon, etc) | 0x2C<var_uint_type_name_size><type_name_data> | | Custom type (Ring, Polygon, etc) | 0x2C<var_uint_type_name_size><type_name_data> |
| Bool | 0x2D | | Bool | 0x2D |
| SimpleAggregateFunction(function_name(param_1, ..., param_N), arg_T1, ..., arg_TN) | 0x2E<var_uint_function_name_size><function_name_data><var_uint_number_of_parameters><param_1>...<param_N><var_uint_number_of_arguments><argument_type_encoding_1>...<argument_type_encoding_N> | | SimpleAggregateFunction(function_name(param_1, ..., param_N), arg_T1, ..., arg_TN) | 0x2E<var_uint_function_name_size><function_name_data><var_uint_number_of_parameters><param_1>...<param_N><var_uint_number_of_arguments><argument_type_encoding_1>...<argument_type_encoding_N> |
| Nested(name1 T1, ..., nameN TN) | 0x2F<var_uint_number_of_elements><var_uint_name_size_1><name_data_1><nested_type_encoding_1>...<var_uint_name_size_N><name_data_N><nested_type_encoding_N> | | Nested(name1 T1, ..., nameN TN) | 0x2F<var_uint_number_of_elements><var_uint_name_size_1><name_data_1><nested_type_encoding_1>...<var_uint_name_size_N><name_data_N><nested_type_encoding_N> |
|------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | JSON(max_dynamic_paths=N, max_dynamic_types=M, path Type, SKIP skip_path, SKIP REGEXP skip_path_regexp) | 0x30<uint8_serialization_version><var_uint_max_dynamic_paths><uint8_max_dynamic_types><var_uint_number_of_typed_paths><var_uint_path_name_size_1><path_name_data_1><encoded_type_1>...<var_uint_number_of_skip_paths><var_uint_skip_path_size_1><skip_path_data_1>...<var_uint_number_of_skip_path_regexps><var_uint_skip_path_regexp_size_1><skip_path_data_regexp_1>... |
|---------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
Interval kind binary encoding: Interval kind binary encoding:
|---------------|-----------------| |---------------|-----------------|

View File

@ -178,8 +178,7 @@ DataTypePtr FieldToDataType<on_error>::operator() (const Map & map) const
template <LeastSupertypeOnError on_error> template <LeastSupertypeOnError on_error>
DataTypePtr FieldToDataType<on_error>::operator() (const Object &) const DataTypePtr FieldToDataType<on_error>::operator() (const Object &) const
{ {
/// TODO: Do we need different parameters for type Object? return std::make_shared<DataTypeObject>(DataTypeObject::SchemaFormat::JSON);
return std::make_shared<DataTypeObject>("json", false);
} }
template <LeastSupertypeOnError on_error> template <LeastSupertypeOnError on_error>

View File

@ -363,9 +363,10 @@ bool isArray(TYPE data_type) { return WhichDataType(data_type).isArray(); } \
bool isTuple(TYPE data_type) { return WhichDataType(data_type).isTuple(); } \ bool isTuple(TYPE data_type) { return WhichDataType(data_type).isTuple(); } \
bool isMap(TYPE data_type) {return WhichDataType(data_type).isMap(); } \ bool isMap(TYPE data_type) {return WhichDataType(data_type).isMap(); } \
bool isInterval(TYPE data_type) {return WhichDataType(data_type).isInterval(); } \ bool isInterval(TYPE data_type) {return WhichDataType(data_type).isInterval(); } \
bool isObject(TYPE data_type) { return WhichDataType(data_type).isObject(); } \ bool isObjectDeprecated(TYPE data_type) { return WhichDataType(data_type).isObjectDeprecated(); } \
bool isVariant(TYPE data_type) { return WhichDataType(data_type).isVariant(); } \ bool isVariant(TYPE data_type) { return WhichDataType(data_type).isVariant(); } \
bool isDynamic(TYPE data_type) { return WhichDataType(data_type).isDynamic(); } \ bool isDynamic(TYPE data_type) { return WhichDataType(data_type).isDynamic(); } \
bool isObject(TYPE data_type) { return WhichDataType(data_type).isObject(); } \
bool isNothing(TYPE data_type) { return WhichDataType(data_type).isNothing(); } \ bool isNothing(TYPE data_type) { return WhichDataType(data_type).isNothing(); } \
\ \
bool isColumnedAsNumber(TYPE data_type) \ bool isColumnedAsNumber(TYPE data_type) \

View File

@ -432,7 +432,7 @@ struct WhichDataType
constexpr bool isMap() const {return idx == TypeIndex::Map; } constexpr bool isMap() const {return idx == TypeIndex::Map; }
constexpr bool isSet() const { return idx == TypeIndex::Set; } constexpr bool isSet() const { return idx == TypeIndex::Set; }
constexpr bool isInterval() const { return idx == TypeIndex::Interval; } constexpr bool isInterval() const { return idx == TypeIndex::Interval; }
constexpr bool isObject() const { return idx == TypeIndex::Object; } constexpr bool isObjectDeprecated() const { return idx == TypeIndex::ObjectDeprecated; }
constexpr bool isNothing() const { return idx == TypeIndex::Nothing; } constexpr bool isNothing() const { return idx == TypeIndex::Nothing; }
constexpr bool isNullable() const { return idx == TypeIndex::Nullable; } constexpr bool isNullable() const { return idx == TypeIndex::Nullable; }
@ -444,6 +444,7 @@ struct WhichDataType
constexpr bool isVariant() const { return idx == TypeIndex::Variant; } constexpr bool isVariant() const { return idx == TypeIndex::Variant; }
constexpr bool isDynamic() const { return idx == TypeIndex::Dynamic; } constexpr bool isDynamic() const { return idx == TypeIndex::Dynamic; }
constexpr bool isObject() const { return idx == TypeIndex::Object; }
}; };
/// IDataType helpers (alternative for IDataType virtual methods with single point of truth) /// IDataType helpers (alternative for IDataType virtual methods with single point of truth)
@ -502,9 +503,10 @@ bool isArray(TYPE data_type); \
bool isTuple(TYPE data_type); \ bool isTuple(TYPE data_type); \
bool isMap(TYPE data_type); \ bool isMap(TYPE data_type); \
bool isInterval(TYPE data_type); \ bool isInterval(TYPE data_type); \
bool isObject(TYPE data_type); \ bool isObjectDeprecated(TYPE data_type); \
bool isVariant(TYPE data_type); \ bool isVariant(TYPE data_type); \
bool isDynamic(TYPE data_type); \ bool isDynamic(TYPE data_type); \
bool isObject(TYPE data_type); \
bool isNothing(TYPE data_type); \ bool isNothing(TYPE data_type); \
\ \
bool isColumnedAsNumber(TYPE data_type); \ bool isColumnedAsNumber(TYPE data_type); \

View File

@ -4,7 +4,7 @@
#include <Analyzer/QueryNode.h> #include <Analyzer/QueryNode.h>
#include <Analyzer/Utils.h> #include <Analyzer/Utils.h>
#include <DataTypes/ObjectUtils.h> #include <DataTypes/ObjectUtils.h>
#include <DataTypes/DataTypeObject.h> #include <DataTypes/DataTypeObjectDeprecated.h>
#include <DataTypes/DataTypeNothing.h> #include <DataTypes/DataTypeNothing.h>
#include <DataTypes/DataTypeArray.h> #include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeMap.h> #include <DataTypes/DataTypeMap.h>
@ -16,7 +16,7 @@
#include <DataTypes/getLeastSupertype.h> #include <DataTypes/getLeastSupertype.h>
#include <DataTypes/NestedUtils.h> #include <DataTypes/NestedUtils.h>
#include <Storages/StorageSnapshot.h> #include <Storages/StorageSnapshot.h>
#include <Columns/ColumnObject.h> #include <Columns/ColumnObjectDeprecated.h>
#include <Columns/ColumnTuple.h> #include <Columns/ColumnTuple.h>
#include <Columns/ColumnArray.h> #include <Columns/ColumnArray.h>
#include <Columns/ColumnMap.h> #include <Columns/ColumnMap.h>
@ -180,12 +180,12 @@ static DataTypePtr recreateTupleWithElements(const DataTypeTuple & type_tuple, c
} }
static std::pair<ColumnPtr, DataTypePtr> convertObjectColumnToTuple( static std::pair<ColumnPtr, DataTypePtr> convertObjectColumnToTuple(
const ColumnObject & column_object, const DataTypeObject & type_object) const ColumnObjectDeprecated & column_object, const DataTypeObjectDeprecated & type_object)
{ {
if (!column_object.isFinalized()) if (!column_object.isFinalized())
{ {
auto finalized = column_object.cloneFinalized(); auto finalized = column_object.cloneFinalized();
const auto & finalized_object = assert_cast<const ColumnObject &>(*finalized); const auto & finalized_object = assert_cast<const ColumnObjectDeprecated &>(*finalized);
return convertObjectColumnToTuple(finalized_object, type_object); return convertObjectColumnToTuple(finalized_object, type_object);
} }
@ -211,9 +211,9 @@ static std::pair<ColumnPtr, DataTypePtr> recursivlyConvertDynamicColumnToTuple(
if (!type->hasDynamicSubcolumnsDeprecated()) if (!type->hasDynamicSubcolumnsDeprecated())
return {column, type}; return {column, type};
if (const auto * type_object = typeid_cast<const DataTypeObject *>(type.get())) if (const auto * type_object = typeid_cast<const DataTypeObjectDeprecated *>(type.get()))
{ {
const auto & column_object = assert_cast<const ColumnObject &>(*column); const auto & column_object = assert_cast<const ColumnObjectDeprecated &>(*column);
return convertObjectColumnToTuple(column_object, *type_object); return convertObjectColumnToTuple(column_object, *type_object);
} }
@ -369,7 +369,7 @@ static DataTypePtr getLeastCommonTypeForObject(const DataTypes & types, bool che
for (const auto & [key, subtypes] : subcolumns_types) for (const auto & [key, subtypes] : subcolumns_types)
{ {
assert(!subtypes.empty()); assert(!subtypes.empty());
if (key.getPath() == ColumnObject::COLUMN_NAME_DUMMY) if (key.getPath() == ColumnObjectDeprecated::COLUMN_NAME_DUMMY)
continue; continue;
size_t first_dim = getNumberOfDimensions(*subtypes[0]); size_t first_dim = getNumberOfDimensions(*subtypes[0]);
@ -385,7 +385,7 @@ static DataTypePtr getLeastCommonTypeForObject(const DataTypes & types, bool che
if (tuple_paths.empty()) if (tuple_paths.empty())
{ {
tuple_paths.emplace_back(ColumnObject::COLUMN_NAME_DUMMY); tuple_paths.emplace_back(ColumnObjectDeprecated::COLUMN_NAME_DUMMY);
tuple_types.emplace_back(std::make_shared<DataTypeUInt8>()); tuple_types.emplace_back(std::make_shared<DataTypeUInt8>());
} }
@ -452,7 +452,7 @@ static DataTypePtr getLeastCommonTypeForDynamicColumnsImpl(
if (!type_in_storage->hasDynamicSubcolumnsDeprecated()) if (!type_in_storage->hasDynamicSubcolumnsDeprecated())
return type_in_storage; return type_in_storage;
if (isObject(type_in_storage)) if (isObjectDeprecated(type_in_storage))
return getLeastCommonTypeForObject(concrete_types, check_ambiguos_paths); return getLeastCommonTypeForObject(concrete_types, check_ambiguos_paths);
if (const auto * type_array = typeid_cast<const DataTypeArray *>(type_in_storage.get())) if (const auto * type_array = typeid_cast<const DataTypeArray *>(type_in_storage.get()))
@ -494,9 +494,9 @@ DataTypePtr createConcreteEmptyDynamicColumn(const DataTypePtr & type_in_storage
if (!type_in_storage->hasDynamicSubcolumnsDeprecated()) if (!type_in_storage->hasDynamicSubcolumnsDeprecated())
return type_in_storage; return type_in_storage;
if (isObject(type_in_storage)) if (isObjectDeprecated(type_in_storage))
return std::make_shared<DataTypeTuple>( return std::make_shared<DataTypeTuple>(
DataTypes{std::make_shared<DataTypeUInt8>()}, Names{ColumnObject::COLUMN_NAME_DUMMY}); DataTypes{std::make_shared<DataTypeUInt8>()}, Names{ColumnObjectDeprecated::COLUMN_NAME_DUMMY});
if (const auto * type_array = typeid_cast<const DataTypeArray *>(type_in_storage.get())) if (const auto * type_array = typeid_cast<const DataTypeArray *>(type_in_storage.get()))
return std::make_shared<DataTypeArray>( return std::make_shared<DataTypeArray>(
@ -838,7 +838,7 @@ DataTypePtr unflattenTuple(const PathsInData & paths, const DataTypes & tuple_ty
return unflattenTuple(paths, tuple_types, tuple_columns).second; return unflattenTuple(paths, tuple_types, tuple_columns).second;
} }
std::pair<ColumnPtr, DataTypePtr> unflattenObjectToTuple(const ColumnObject & column) std::pair<ColumnPtr, DataTypePtr> unflattenObjectToTuple(const ColumnObjectDeprecated & column)
{ {
const auto & subcolumns = column.getSubcolumns(); const auto & subcolumns = column.getSubcolumns();
@ -846,7 +846,7 @@ std::pair<ColumnPtr, DataTypePtr> unflattenObjectToTuple(const ColumnObject & co
{ {
auto type = std::make_shared<DataTypeTuple>( auto type = std::make_shared<DataTypeTuple>(
DataTypes{std::make_shared<DataTypeUInt8>()}, DataTypes{std::make_shared<DataTypeUInt8>()},
Names{ColumnObject::COLUMN_NAME_DUMMY}); Names{ColumnObjectDeprecated::COLUMN_NAME_DUMMY});
return {type->createColumn()->cloneResized(column.size()), type}; return {type->createColumn()->cloneResized(column.size()), type};
} }

View File

@ -6,7 +6,7 @@
#include <Storages/ColumnsDescription.h> #include <Storages/ColumnsDescription.h>
#include <DataTypes/DataTypeTuple.h> #include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypesNumber.h> #include <DataTypes/DataTypesNumber.h>
#include <Columns/ColumnObject.h> #include <Columns/ColumnObjectDeprecated.h>
namespace DB namespace DB
{ {
@ -88,7 +88,7 @@ DataTypePtr unflattenTuple(
const PathsInData & paths, const PathsInData & paths,
const DataTypes & tuple_types); const DataTypes & tuple_types);
std::pair<ColumnPtr, DataTypePtr> unflattenObjectToTuple(const ColumnObject & column); std::pair<ColumnPtr, DataTypePtr> unflattenObjectToTuple(const ColumnObjectDeprecated & column);
std::pair<ColumnPtr, DataTypePtr> unflattenTuple( std::pair<ColumnPtr, DataTypePtr> unflattenTuple(
const PathsInData & paths, const PathsInData & paths,

View File

@ -202,6 +202,12 @@ String getNameForSubstreamPath(
stream_name += "." + it->variant_element_name + ".null"; stream_name += "." + it->variant_element_name + ".null";
else if (it->type == SubstreamType::DynamicStructure) else if (it->type == SubstreamType::DynamicStructure)
stream_name += ".dynamic_structure"; stream_name += ".dynamic_structure";
else if (it->type == SubstreamType::ObjectStructure)
stream_name += ".object_structure";
else if (it->type == SubstreamType::ObjectSharedData)
stream_name += ".object_shared_data";
else if (it->type == SubstreamType::ObjectTypedPath || it->type == SubstreamType::ObjectDynamicPath)
stream_name += "." + it->object_path_name;
} }
return stream_name; return stream_name;
@ -401,7 +407,17 @@ bool ISerialization::hasSubcolumnForPath(const SubstreamPath & path, size_t pref
|| path[last_elem].type == Substream::TupleElement || path[last_elem].type == Substream::TupleElement
|| path[last_elem].type == Substream::ArraySizes || path[last_elem].type == Substream::ArraySizes
|| path[last_elem].type == Substream::VariantElement || path[last_elem].type == Substream::VariantElement
|| path[last_elem].type == Substream::VariantElementNullMap; || path[last_elem].type == Substream::VariantElementNullMap
|| path[last_elem].type == Substream::ObjectTypedPath;
}
/// Checks whether the substream addressed by the first `prefix_len` elements of `path` is an ephemeral subcolumn.
/// Only a prefix that ends with a Variant element null-map substream is reported as ephemeral.
/// An empty prefix, or a prefix longer than the path itself, is never ephemeral.
bool ISerialization::isEphemeralSubcolumn(const DB::ISerialization::SubstreamPath & path, size_t prefix_len)
{
    const bool prefix_is_valid = prefix_len != 0 && prefix_len <= path.size();

    /// The kind of the subcolumn is defined by the last element of the prefix.
    return prefix_is_valid && path[prefix_len - 1].type == Substream::VariantElementNullMap;
}
ISerialization::SubstreamData ISerialization::createFromPath(const SubstreamPath & path, size_t prefix_len) ISerialization::SubstreamData ISerialization::createFromPath(const SubstreamPath & path, size_t prefix_len)

View File

@ -176,8 +176,8 @@ public:
SparseElements, SparseElements,
SparseOffsets, SparseOffsets,
ObjectStructure, DeprecatedObjectStructure,
ObjectData, DeprecatedObjectData,
VariantDiscriminators, VariantDiscriminators,
NamedVariantDiscriminators, NamedVariantDiscriminators,
@ -189,6 +189,12 @@ public:
DynamicData, DynamicData,
DynamicStructure, DynamicStructure,
ObjectData,
ObjectTypedPath,
ObjectDynamicPath,
ObjectSharedData,
ObjectStructure,
Regular, Regular,
}; };
@ -203,6 +209,9 @@ public:
/// Name of substream for type from 'named_types'. /// Name of substream for type from 'named_types'.
String name_of_substream; String name_of_substream;
/// Path name for Object type elements.
String object_path_name;
/// Data for current substream. /// Data for current substream.
SubstreamData data; SubstreamData data;
@ -263,13 +272,13 @@ public:
bool use_compact_variant_discriminators_serialization = false; bool use_compact_variant_discriminators_serialization = false;
enum class DynamicStatisticsMode enum class ObjectAndDynamicStatisticsMode
{ {
NONE, /// Don't write statistics. NONE, /// Don't write statistics.
PREFIX, /// Write statistics in prefix. PREFIX, /// Write statistics in prefix.
SUFFIX, /// Write statistics in suffix. SUFFIX, /// Write statistics in suffix.
}; };
DynamicStatisticsMode dynamic_write_statistics = DynamicStatisticsMode::NONE; ObjectAndDynamicStatisticsMode object_and_dynamic_write_statistics = ObjectAndDynamicStatisticsMode::NONE;
}; };
struct DeserializeBinaryBulkSettings struct DeserializeBinaryBulkSettings
@ -290,7 +299,7 @@ public:
/// If not zero, may be used to avoid reallocations while reading column of String type. /// If not zero, may be used to avoid reallocations while reading column of String type.
double avg_value_size_hint = 0; double avg_value_size_hint = 0;
bool dynamic_read_statistics = false; bool object_and_dynamic_read_statistics = false;
}; };
/// Call before serializeBinaryBulkWithMultipleStreams chain to write something before first mark. /// Call before serializeBinaryBulkWithMultipleStreams chain to write something before first mark.
@ -440,6 +449,10 @@ public:
static bool hasSubcolumnForPath(const SubstreamPath & path, size_t prefix_len); static bool hasSubcolumnForPath(const SubstreamPath & path, size_t prefix_len);
static SubstreamData createFromPath(const SubstreamPath & path, size_t prefix_len); static SubstreamData createFromPath(const SubstreamPath & path, size_t prefix_len);
/// Returns true if the subcolumn doesn't actually store any data in the column and doesn't require a separate stream
/// for writing/reading data. For example, the null-map subcolumn of Variant type (it's always constructed from discriminators).
static bool isEphemeralSubcolumn(const SubstreamPath & path, size_t prefix_len);
protected: protected:
template <typename State, typename StatePtr> template <typename State, typename StatePtr>
State * checkAndGetState(const StatePtr & state) const; State * checkAndGetState(const StatePtr & state) const;

View File

@ -143,7 +143,7 @@ void SerializationDynamic::serializeBinaryBulkStatePrefix(
} }
/// Write statistics in prefix if needed. /// Write statistics in prefix if needed.
if (settings.dynamic_write_statistics == SerializeBinaryBulkSettings::DynamicStatisticsMode::PREFIX) if (settings.object_and_dynamic_write_statistics == SerializeBinaryBulkSettings::ObjectAndDynamicStatisticsMode::PREFIX)
{ {
const auto & statistics = column_dynamic.getStatistics(); const auto & statistics = column_dynamic.getStatistics();
/// First, write statistics for usual variants. /// First, write statistics for usual variants.
@ -225,8 +225,8 @@ void SerializationDynamic::deserializeBinaryBulkStatePrefix(
return; return;
auto dynamic_state = std::make_shared<DeserializeBinaryBulkStateDynamic>(); auto dynamic_state = std::make_shared<DeserializeBinaryBulkStateDynamic>();
dynamic_state->structure_state = structure_state; dynamic_state->structure_state = std::move(structure_state);
dynamic_state->variant_serialization = checkAndGetState<DeserializeBinaryBulkStateDynamicStructure>(structure_state)->variant_type->getDefaultSerialization(); dynamic_state->variant_serialization = checkAndGetState<DeserializeBinaryBulkStateDynamicStructure>(dynamic_state->structure_state)->variant_type->getDefaultSerialization();
settings.path.push_back(Substream::DynamicData); settings.path.push_back(Substream::DynamicData);
dynamic_state->variant_serialization->deserializeBinaryBulkStatePrefix(settings, dynamic_state->variant_state, cache); dynamic_state->variant_serialization->deserializeBinaryBulkStatePrefix(settings, dynamic_state->variant_state, cache);
@ -243,7 +243,7 @@ ISerialization::DeserializeBinaryBulkStatePtr SerializationDynamic::deserializeD
DeserializeBinaryBulkStatePtr state = nullptr; DeserializeBinaryBulkStatePtr state = nullptr;
if (auto cached_state = getFromSubstreamsDeserializeStatesCache(cache, settings.path)) if (auto cached_state = getFromSubstreamsDeserializeStatesCache(cache, settings.path))
{ {
state = cached_state; state = std::move(cached_state);
} }
else if (auto * structure_stream = settings.getter(settings.path)) else if (auto * structure_stream = settings.getter(settings.path))
{ {
@ -277,16 +277,12 @@ ISerialization::DeserializeBinaryBulkStatePtr SerializationDynamic::deserializeD
auto variant_type = std::make_shared<DataTypeVariant>(variants); auto variant_type = std::make_shared<DataTypeVariant>(variants);
/// Read statistics. /// Read statistics.
if (settings.dynamic_read_statistics) if (settings.object_and_dynamic_read_statistics)
{ {
ColumnDynamic::Statistics statistics(ColumnDynamic::Statistics::Source::READ); ColumnDynamic::Statistics statistics(ColumnDynamic::Statistics::Source::READ);
/// First, read statistics for usual variants. /// First, read statistics for usual variants.
size_t variant_size;
for (const auto & variant : variant_type->getVariants()) for (const auto & variant : variant_type->getVariants())
{ readVarUInt(statistics.variants_statistics[variant->getName()], *structure_stream);
readVarUInt(variant_size, *structure_stream);
statistics.variants_statistics[variant->getName()] = variant_size;
}
/// Second, read statistics for shared variants. /// Second, read statistics for shared variants.
size_t statistics_size; size_t statistics_size;
@ -295,8 +291,7 @@ ISerialization::DeserializeBinaryBulkStatePtr SerializationDynamic::deserializeD
for (size_t i = 0; i != statistics_size; ++i) for (size_t i = 0; i != statistics_size; ++i)
{ {
readStringBinary(variant_name, *structure_stream); readStringBinary(variant_name, *structure_stream);
readVarUInt(variant_size, *structure_stream); readVarUInt(statistics.shared_variants_statistics[variant_name], *structure_stream);
statistics.shared_variants_statistics[variant_name] = variant_size;
} }
structure_state->statistics = std::make_shared<const ColumnDynamic::Statistics>(std::move(statistics)); structure_state->statistics = std::make_shared<const ColumnDynamic::Statistics>(std::move(statistics));
@ -320,10 +315,10 @@ void SerializationDynamic::serializeBinaryBulkStateSuffix(
settings.path.pop_back(); settings.path.pop_back();
if (!stream) if (!stream)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Missing stream for Dynamic column structure during serialization of binary bulk state prefix"); throw Exception(ErrorCodes::LOGICAL_ERROR, "Missing stream for Dynamic column structure during serialization of binary bulk state suffix");
/// Write statistics in suffix if needed. /// Write statistics in suffix if needed.
if (settings.dynamic_write_statistics == SerializeBinaryBulkSettings::DynamicStatisticsMode::SUFFIX) if (settings.object_and_dynamic_write_statistics == SerializeBinaryBulkSettings::ObjectAndDynamicStatisticsMode::SUFFIX)
{ {
/// First, write statistics for usual variants. /// First, write statistics for usual variants.
for (const auto & variant_name : dynamic_state->variant_names) for (const auto & variant_name : dynamic_state->variant_names)
@ -348,6 +343,18 @@ void SerializationDynamic::serializeBinaryBulkWithMultipleStreams(
size_t limit, size_t limit,
SerializeBinaryBulkSettings & settings, SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const SerializeBinaryBulkStatePtr & state) const
{
size_t tmp_size;
serializeBinaryBulkWithMultipleStreamsAndCountTotalSizeOfVariants(column, offset, limit, settings, state, tmp_size);
}
void SerializationDynamic::serializeBinaryBulkWithMultipleStreamsAndCountTotalSizeOfVariants(
const IColumn & column,
size_t offset,
size_t limit,
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state,
size_t & total_size_of_variants) const
{ {
const auto & column_dynamic = assert_cast<const ColumnDynamic &>(column); const auto & column_dynamic = assert_cast<const ColumnDynamic &>(column);
auto * dynamic_state = checkAndGetState<SerializeBinaryBulkStateDynamic>(state); auto * dynamic_state = checkAndGetState<SerializeBinaryBulkStateDynamic>(state);
@ -361,10 +368,18 @@ void SerializationDynamic::serializeBinaryBulkWithMultipleStreams(
throw Exception(ErrorCodes::LOGICAL_ERROR, "Mismatch of max_dynamic_types parameter of Dynamic. Expected: {}, Got: {}", dynamic_state->max_dynamic_types, column_dynamic.getMaxDynamicTypes()); throw Exception(ErrorCodes::LOGICAL_ERROR, "Mismatch of max_dynamic_types parameter of Dynamic. Expected: {}, Got: {}", dynamic_state->max_dynamic_types, column_dynamic.getMaxDynamicTypes());
settings.path.push_back(Substream::DynamicData); settings.path.push_back(Substream::DynamicData);
assert_cast<const SerializationVariant &>(*dynamic_state->variant_serialization)
.serializeBinaryBulkWithMultipleStreamsAndUpdateVariantStatistics(
*variant_column,
offset,
limit,
settings,
dynamic_state->variant_state,
dynamic_state->statistics.variants_statistics,
total_size_of_variants);
if (dynamic_state->recalculate_statistics) if (dynamic_state->recalculate_statistics)
{ {
assert_cast<const SerializationVariant &>(*dynamic_state->variant_serialization)
.serializeBinaryBulkWithMultipleStreamsAndUpdateVariantStatistics(*variant_column, offset, limit, settings, dynamic_state->variant_state, dynamic_state->statistics.variants_statistics);
/// Calculate statistics for shared variants. /// Calculate statistics for shared variants.
const auto & shared_variant = column_dynamic.getSharedVariant(); const auto & shared_variant = column_dynamic.getSharedVariant();
if (!shared_variant.empty()) if (!shared_variant.empty())
@ -389,10 +404,6 @@ void SerializationDynamic::serializeBinaryBulkWithMultipleStreams(
} }
} }
} }
else
{
assert_cast<const SerializationVariant &>(*dynamic_state->variant_serialization).serializeBinaryBulkWithMultipleStreams(*variant_column, offset, limit, settings, dynamic_state->variant_state);
}
settings.path.pop_back(); settings.path.pop_back();
} }
@ -753,6 +764,12 @@ void SerializationDynamic::serializeTextJSON(const IColumn & column, size_t row_
serializeTextImpl(column, row_num, ostr, settings, nested_serialize); serializeTextImpl(column, row_num, ostr, settings, nested_serialize);
} }
/// Writes the value at `row_num` as pretty-printed JSON.
/// The work is delegated to the default serialization of the Variant type that currently backs the Dynamic column,
/// applied to the underlying variant column with the same row, settings and indent.
void SerializationDynamic::serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent) const
{
    const auto & dynamic_column = assert_cast<const ColumnDynamic &>(column);

    /// Resolve the serialization first, then hand the nested variant column over to it.
    const auto variant_serialization = dynamic_column.getVariantInfo().variant_type->getDefaultSerialization();
    variant_serialization->serializeTextJSONPretty(dynamic_column.getVariantColumn(), row_num, ostr, settings, indent);
}
void SerializationDynamic::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const void SerializationDynamic::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{ {
auto read_field = [&settings](ReadBuffer & buf) auto read_field = [&settings](ReadBuffer & buf)

View File

@ -1,6 +1,7 @@
#pragma once #pragma once
#include <DataTypes/Serializations/ISerialization.h> #include <DataTypes/Serializations/ISerialization.h>
#include <DataTypes/DataTypeDynamic.h>
#include <Columns/ColumnDynamic.h> #include <Columns/ColumnDynamic.h>
namespace DB namespace DB
@ -11,7 +12,7 @@ class SerializationDynamicElement;
class SerializationDynamic : public ISerialization class SerializationDynamic : public ISerialization
{ {
public: public:
explicit SerializationDynamic(size_t max_dynamic_types_) : max_dynamic_types(max_dynamic_types_) explicit SerializationDynamic(size_t max_dynamic_types_ = DataTypeDynamic::DEFAULT_MAX_DYNAMIC_TYPES) : max_dynamic_types(max_dynamic_types_)
{ {
} }
@ -59,6 +60,14 @@ public:
SerializeBinaryBulkSettings & settings, SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override; SerializeBinaryBulkStatePtr & state) const override;
void serializeBinaryBulkWithMultipleStreamsAndCountTotalSizeOfVariants(
const IColumn & column,
size_t offset,
size_t limit,
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state,
size_t & total_size_of_variants) const;
void deserializeBinaryBulkWithMultipleStreams( void deserializeBinaryBulkWithMultipleStreams(
ColumnPtr & column, ColumnPtr & column,
size_t limit, size_t limit,
@ -89,6 +98,7 @@ public:
bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
void serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent) const override;
void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;

View File

@ -53,6 +53,7 @@ void SerializationDynamicElement::enumerateStreams(
.withColumn(data.column) .withColumn(data.column)
.withSerializationInfo(data.serialization_info) .withSerializationInfo(data.serialization_info)
.withDeserializeState(deserialize_state->variant_element_state); .withDeserializeState(deserialize_state->variant_element_state);
settings.path.back().data = variant_data;
deserialize_state->variant_serialization->enumerateStreams(settings, callback, variant_data); deserialize_state->variant_serialization->enumerateStreams(settings, callback, variant_data);
settings.path.pop_back(); settings.path.pop_back();
} }

View File

@ -0,0 +1,405 @@
#include <DataTypes/Serializations/SerializationJSON.h>
#include <IO/WriteHelpers.h>
#include <IO/ReadHelpers.h>
#if USE_SIMDJSON
#include <Common/JSONParsers/SimdJSONParser.h>
#endif
#if USE_RAPIDJSON
#include <Common/JSONParsers/RapidJSONParser.h>
#endif
#include <Common/JSONParsers/DummyJSONParser.h>
namespace DB
{
namespace ErrorCodes
{
extern const int INCORRECT_DATA;
}
/// Constructs the JSON serialization.
/// The typed paths serializations and both skip lists (exact paths and path regexps) are forwarded
/// to the SerializationObject base; the JSON extract tree is moved into this object, which becomes its owner.
/// NOTE(review): how json_extract_tree is used afterwards is not visible in this part of the file.
template <typename Parser>
SerializationJSON<Parser>::SerializationJSON(
std::unordered_map<String, SerializationPtr> typed_paths_serializations_,
const std::unordered_set<String> & paths_to_skip_,
const std::vector<String> & path_regexps_to_skip_,
std::unique_ptr<JSONExtractTreeNode<Parser>> json_extract_tree_)
: SerializationObject(std::move(typed_paths_serializations_), paths_to_skip_, path_regexps_to_skip_)
, json_extract_tree(std::move(json_extract_tree_))
{
}
namespace
{
/// Struct that represents elements of the JSON path.
/// "a.b.c" -> ["a", "b", "c"]
/// Every '.' is a separator and empty elements are kept: "" -> [""], "a." -> ["a", ""], "a..b" -> ["a", "", "b"].
/// The elements are views into the character buffer of the path given to the constructor,
/// so that buffer must stay alive and unmodified for as long as the elements are used.
struct PathElements
{
    /// Splits `path` by '.'. Always produces at least one element.
    explicit PathElements(std::string_view path)
    {
        /// Track the offset where the current element starts instead of a pointer to the previous dot:
        /// a "previous dot" pointer would have to start one position before the buffer,
        /// and forming such a pointer is undefined behaviour.
        size_t element_start = 0;
        for (size_t pos = 0; pos != path.size(); ++pos)
        {
            if (path[pos] == '.')
            {
                elements.emplace_back(path.substr(element_start, pos - element_start));
                element_start = pos + 1;
            }
        }

        /// Everything after the last dot (or the whole path if it has no dots) is the last element.
        elements.emplace_back(path.substr(element_start));
    }

    /// Number of elements in the path (never 0).
    size_t size() const { return elements.size(); }

    std::vector<std::string_view> elements;
};
/// Prefix of a JSON path: the chain of nested objects we are currently inside of while writing a JSON object.
struct Prefix
{
    /// Cuts the prefix down to its longest leading part that is shared with the given path.
    /// The last element of the path is never compared and never becomes part of the prefix.
    /// Example: prefix a.b.c.d and path a.b.e -> the prefix becomes a.b.
    void shrinkToCommonPrefix(const PathElements & path_elements)
    {
        const auto & path = path_elements.elements;
        const size_t max_common = path.size() - 1;

        size_t common = 0;
        for (; common != elements.size() && common != max_common; ++common)
        {
            if (elements[common].first != path[common])
                break;
        }

        elements.resize(common);
    }

    /// Returns the is_first flag of the current object (of the root when the prefix is empty).
    bool isFirstInCurrentObject() const
    {
        return elements.empty() ? root_is_first_flag : elements.back().second;
    }

    /// Resets the is_first flag of the current object (of the root when the prefix is empty) to false.
    void setNotFirstInCurrentObject()
    {
        bool & is_first = elements.empty() ? root_is_first_flag : elements.back().second;
        is_first = false;
    }

    /// Depth of the prefix, i.e. the number of path elements in it.
    size_t size() const { return elements.size(); }

    /// One entry per prefix element: (path element, is_first flag).
    /// The flag stays true until some key has been serialized in the object with such prefix.
    std::vector<std::pair<std::string_view, bool>> elements;
    /// The same is_first flag, but for the root object (empty prefix).
    bool root_is_first_flag = true;
};
}
template <typename Parser>
void SerializationJSON<Parser>::serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, bool pretty, size_t indent) const
{
const auto & column_object = assert_cast<const ColumnObject &>(column);
const auto & typed_paths = column_object.getTypedPaths();
const auto & dynamic_paths = column_object.getDynamicPaths();
const auto & shared_data_offsets = column_object.getSharedDataOffsets();
const auto [shared_data_paths, shared_data_values] = column_object.getSharedDataPathsAndValues();
size_t shared_data_offset = shared_data_offsets[static_cast<ssize_t>(row_num) - 1];
size_t shared_data_end = shared_data_offsets[static_cast<ssize_t>(row_num)];
/// We need to convert the set of paths in this row to a JSON object.
/// To do it, we first collect all the paths from current row, then we sort them
/// and construct the resulting JSON object by iterating over sorted list of paths.
/// For example:
/// b.c, a.b, a.a, b.e, g, h.u.t -> a.a, a.b, b.c, b.e, g, h.u.t -> {"a" : {"a" : ..., "b" : ...}, "b" : {"c" : ..., "e" : ...}, "g" : ..., "h" : {"u" : {"t" : ...}}}.
std::vector<String> sorted_paths;
sorted_paths.reserve(typed_paths.size() + dynamic_paths.size() + (shared_data_end - shared_data_offset));
for (const auto & [path, _] : typed_paths)
sorted_paths.emplace_back(path);
for (const auto & [path, dynamic_column] : dynamic_paths)
{
/// We consider null value and absence of the path in a row as equivalent cases, because we cannot actually distinguish them.
/// So, we don't output null values at all.
if (!dynamic_column->isNullAt(row_num))
sorted_paths.emplace_back(path);
}
for (size_t i = shared_data_offset; i != shared_data_end; ++i)
{
auto path = shared_data_paths->getDataAt(i).toString();
sorted_paths.emplace_back(path);
}
std::sort(sorted_paths.begin(), sorted_paths.end());
if (pretty)
writeCString("{\n", ostr);
else
writeChar('{', ostr);
size_t index_in_shared_data_values = shared_data_offset;
/// current_prefix represents the path of the object we are currently serializing keys in.
Prefix current_prefix;
for (const auto & path : sorted_paths)
{
PathElements path_elements(path);
/// Change prefix to common prefix between current prefix and current path.
/// If prefix changed (it can only decrease), close all finished objects.
/// For example:
/// Current prefix: a.b.c.d
/// Current path: a.b.e.f
/// It means now we have : {..., "a" : {"b" : {"c" : {"d" : ...
/// Common prefix will be a.b, so it means we should close objects a.b.c.d and a.b.c: {..., "a" : {"b" : {"c" : {"d" : ...}}
/// and continue serializing keys in object a.b
size_t prev_prefix_size = current_prefix.size();
current_prefix.shrinkToCommonPrefix(path_elements);
size_t prefix_size = current_prefix.size();
if (prefix_size != prev_prefix_size)
{
size_t objects_to_close = prev_prefix_size - prefix_size;
if (pretty)
{
writeChar('\n', ostr);
for (size_t i = 0; i != objects_to_close; ++i)
{
writeChar(' ', (indent + prefix_size + objects_to_close - i) * 4, ostr);
if (i != objects_to_close - 1)
writeCString("}\n", ostr);
else
writeChar('}', ostr);
}
}
else
{
for (size_t i = 0; i != objects_to_close; ++i)
writeChar('}', ostr);
}
}
/// Now we are inside object that has common prefix with current path.
/// We should go inside all objects in current path.
/// From the example above we should open object a.b.e:
/// {..., "a" : {"b" : {"c" : {"d" : ...}}, "e" : {
if (prefix_size + 1 < path_elements.size())
{
for (size_t i = prefix_size; i != path_elements.size() - 1; ++i)
{
/// Write comma before the key if it's not the first key in this prefix.
if (!current_prefix.isFirstInCurrentObject())
{
if (pretty)
writeCString(",\n", ostr);
else
writeChar(',', ostr);
}
else
{
current_prefix.setNotFirstInCurrentObject();
}
if (pretty)
{
writeChar(' ', (indent + i + 1) * 4, ostr);
writeJSONString(path_elements.elements[i], ostr, settings);
writeCString(" : {\n", ostr);
}
else
{
writeJSONString(path_elements.elements[i], ostr, settings);
writeCString(":{", ostr);
}
/// Update current prefix.
current_prefix.elements.emplace_back(path_elements.elements[i], true);
}
}
/// Write comma before the key if it's not the first key in this prefix.
if (!current_prefix.isFirstInCurrentObject())
{
if (pretty)
writeCString(",\n", ostr);
else
writeChar(',', ostr);
}
else
{
current_prefix.setNotFirstInCurrentObject();
}
if (pretty)
{
writeChar(' ', (indent + current_prefix.size() + 1) * 4, ostr);
writeJSONString(path_elements.elements.back(), ostr, settings);
writeCString(" : ", ostr);
}
else
{
writeJSONString(path_elements.elements.back(), ostr, settings);
writeCString(":", ostr);
}
/// Serialize value of current path.
if (auto typed_it = typed_paths.find(path); typed_it != typed_paths.end())
{
if (pretty)
typed_path_serializations.at(path)->serializeTextJSONPretty(*typed_it->second, row_num, ostr, settings, indent + current_prefix.size() + 1);
else
typed_path_serializations.at(path)->serializeTextJSON(*typed_it->second, row_num, ostr, settings);
}
else if (auto dynamic_it = dynamic_paths.find(path); dynamic_it != dynamic_paths.end())
{
if (pretty)
dynamic_serialization->serializeTextJSONPretty(*dynamic_it->second, row_num, ostr, settings, indent + current_prefix.size() + 1);
else
dynamic_serialization->serializeTextJSON(*dynamic_it->second, row_num, ostr, settings);
}
else
{
/// To serialize value stored in shared data we should first deserialize it from binary format.
auto tmp_dynamic_column = ColumnDynamic::create();
tmp_dynamic_column->reserve(1);
column_object.deserializeValueFromSharedData(shared_data_values, index_in_shared_data_values++, *tmp_dynamic_column);
if (pretty)
dynamic_serialization->serializeTextJSONPretty(*tmp_dynamic_column, 0, ostr, settings, indent + current_prefix.size() + 1);
else
dynamic_serialization->serializeTextJSON(*tmp_dynamic_column, 0, ostr, settings);
}
}
/// Close all remaining open objects.
if (pretty)
{
writeChar('\n', ostr);
for (size_t i = 0; i != current_prefix.elements.size(); ++i)
{
writeChar(' ', (indent + current_prefix.size() - i) * 4, ostr);
writeCString("}\n", ostr);
}
writeChar(' ', indent * 4, ostr);
writeChar('}', ostr);
}
else
{
for (size_t i = 0; i != current_prefix.elements.size(); ++i)
writeChar('}', ostr);
writeChar('}', ostr);
}
}
/// Parses `object` as a JSON document using a parser taken from `parsers_pool`
/// and inserts the parsed document into `column` through `json_extract_tree`.
/// Throws INCORRECT_DATA if the text cannot be parsed or if the insertion fails
/// (in the latter case the message carries the error reported by the extract tree).
template <typename Parser>
void SerializationJSON<Parser>::deserializeTextImpl(IColumn & column, std::string_view object, const FormatSettings & settings) const
{
    typename Parser::Element document;
    auto pooled_parser = parsers_pool.get([] { return new Parser; });

    const bool parsed = pooled_parser->parse(object, document);
    if (!parsed)
        throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse JSON object here: {}", object);

    String insert_error;
    const bool inserted = json_extract_tree->insertResultToColumn(column, document, insert_settings, settings, insert_error);
    if (!inserted)
        throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot insert data into JSON column: {}", insert_error);
}
/// Plain text output of a row: the non-pretty JSON representation written directly to `ostr`.
template <typename Parser>
void SerializationJSON<Parser>::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
    serializeTextImpl(column, row_num, ostr, settings, /* pretty = */ false, /* indent = */ 0);
}
/// Reads everything that is left in `istr` and parses it as a single JSON object.
template <typename Parser>
void SerializationJSON<Parser>::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
    String json_text;
    readStringUntilEOF(json_text, istr);

    deserializeTextImpl(column, json_text, settings);
}
/// Renders the row as non-pretty JSON into a temporary buffer and then writes that text
/// to `ostr` through writeEscapedString.
template <typename Parser>
void SerializationJSON<Parser>::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
    WriteBufferFromOwnString json_buf;
    serializeTextImpl(column, row_num, json_buf, settings, /* pretty = */ false, /* indent = */ 0);

    writeEscapedString(json_buf.str(), ostr);
}
/// Reads one escaped string value from `istr` and parses the unescaped text as a JSON object.
template <typename Parser>
void SerializationJSON<Parser>::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
    String json_text;
    readEscapedString(json_text, istr);

    deserializeTextImpl(column, json_text, settings);
}
/// Renders the row as non-pretty JSON into a temporary buffer and then writes that text
/// to `ostr` through writeQuotedString.
template <typename Parser>
void SerializationJSON<Parser>::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
    WriteBufferFromOwnString json_buf;
    serializeTextImpl(column, row_num, json_buf, settings, /* pretty = */ false, /* indent = */ 0);

    writeQuotedString(json_buf.str(), ostr);
}
/// Reads one quoted string value from `istr` and parses its contents as a JSON object.
template <typename Parser>
void SerializationJSON<Parser>::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
    String json_text;
    readQuotedString(json_text, istr);

    deserializeTextImpl(column, json_text, settings);
}
/// Renders the row as non-pretty JSON into a temporary buffer and then writes that text
/// to `ostr` through writeCSVString.
template <typename Parser>
void SerializationJSON<Parser>::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
    WriteBufferFromOwnString json_buf;
    serializeTextImpl(column, row_num, json_buf, settings, /* pretty = */ false, /* indent = */ 0);

    writeCSVString(json_buf.str(), ostr);
}
/// Reads one CSV field from `istr` (honouring `settings.csv`) and parses it as a JSON object.
template <typename Parser>
void SerializationJSON<Parser>::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
    String json_text;
    readCSVString(json_text, istr, settings.csv);

    deserializeTextImpl(column, json_text, settings);
}
/// Renders the row as non-pretty JSON into a temporary buffer and then writes that text
/// to `ostr` through writeXMLStringForTextElement.
template <typename Parser>
void SerializationJSON<Parser>::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
    WriteBufferFromOwnString json_buf;
    serializeTextImpl(column, row_num, json_buf, settings, /* pretty = */ false, /* indent = */ 0);

    writeXMLStringForTextElement(json_buf.str(), ostr);
}
/// JSON output of a row: the value is already JSON, so the non-pretty representation
/// is written directly to `ostr` without any extra quoting.
template <typename Parser>
void SerializationJSON<Parser>::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
    serializeTextImpl(column, row_num, ostr, settings, /* pretty = */ false, /* indent = */ 0);
}
/// Pretty JSON output of a row: same as serializeTextJSON but with pretty mode enabled
/// and the caller-provided `indent` level forwarded to the implementation.
template <typename Parser>
void SerializationJSON<Parser>::serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent) const
{
    constexpr bool pretty = true;
    serializeTextImpl(column, row_num, ostr, settings, pretty, indent);
}
/// Extracts one JSON object from `istr` with readJSONObjectAsViewPossiblyInvalid and parses it.
/// `scratch` is the buffer handed to the reader; it must stay alive while the returned
/// view is in use, so it is declared before the call and outlives it.
template <typename Parser>
void SerializationJSON<Parser>::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
    String scratch;
    deserializeTextImpl(column, readJSONObjectAsViewPossiblyInvalid(istr, scratch), settings);
}
/// Explicit instantiations of SerializationJSON for the JSON parsers selected at build time.
/// The simdjson-based instantiation is emitted only when USE_SIMDJSON is set.
#if USE_SIMDJSON
template class SerializationJSON<SimdJSONParser>;
#endif
/// Either the RapidJSON-based instantiation or the DummyJSONParser one is emitted,
/// depending solely on USE_RAPIDJSON (independently of USE_SIMDJSON above).
#if USE_RAPIDJSON
template class SerializationJSON<RapidJSONParser>;
#else
template class SerializationJSON<DummyJSONParser>;
#endif
}

View File

@ -0,0 +1,49 @@
#pragma once
#include <DataTypes/Serializations/SerializationObject.h>
#include <Formats/JSONExtractTree.h>
#include <Common/ObjectPool.h>
namespace DB
{
/// Class for text serialization/deserialization of the JSON data type.
/// It derives from SerializationObject and overrides the text formats
/// (Escaped, Quoted, plain text, CSV, JSON, pretty JSON, XML).
/// `Parser` is the JSON parser implementation used to parse input text.
template <typename Parser>
class SerializationJSON : public SerializationObject
{
public:
/// The first three arguments have the same types as the SerializationObject constructor arguments
/// (presumably forwarded to it - the definition is not in this header);
/// `json_extract_tree_` is the extract tree used to insert parsed documents into the column.
SerializationJSON(
std::unordered_map<String, SerializationPtr> typed_paths_serializations_,
const std::unordered_set<String> & paths_to_skip_,
const std::vector<String> & path_regexps_to_skip_,
std::unique_ptr<JSONExtractTreeNode<Parser>> json_extract_tree_);
/// Text format overrides. Each serialize* method writes row `row_num` of `column` to `ostr`;
/// each deserialize* method reads a value from `istr` into `column`.
void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
void serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent) const override;
void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
private:
/// Common implementation of all serialize* methods: writes the row as a JSON object.
/// With `pretty` set, the output is multi-line and `indent` is the current nesting level.
void serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, bool pretty = false, size_t indent = 0) const;
/// Common implementation of all deserialize* methods: parses `object` and inserts it into `column`.
void deserializeTextImpl(IColumn & column, std::string_view object, const FormatSettings & settings) const;
/// Tree used to insert a parsed JSON document into the column.
std::unique_ptr<JSONExtractTreeNode<Parser>> json_extract_tree;
/// Settings passed to the extract tree on every insertion.
JSONExtractInsertSettings insert_settings;
/// Pool of parser objects to make SerializationJSON thread safe.
mutable SimpleObjectPool<Parser> parsers_pool;
};
}

View File

@ -268,9 +268,16 @@ void SerializationLowCardinality::serializeBinaryBulkStateSuffix(
void SerializationLowCardinality::deserializeBinaryBulkStatePrefix( void SerializationLowCardinality::deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings, DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state, DeserializeBinaryBulkStatePtr & state,
SubstreamsDeserializeStatesCache * /*cache*/) const SubstreamsDeserializeStatesCache * cache) const
{ {
settings.path.push_back(Substream::DictionaryKeys); settings.path.push_back(Substream::DictionaryKeys);
if (auto cached_state = getFromSubstreamsDeserializeStatesCache(cache, settings.path))
{
state = std::move(cached_state);
return;
}
auto * stream = settings.getter(settings.path); auto * stream = settings.getter(settings.path);
settings.path.pop_back(); settings.path.pop_back();

File diff suppressed because it is too large Load Diff

View File

@ -1,34 +1,43 @@
#pragma once #pragma once
#include <Columns/ColumnObject.h> #include <Columns/ColumnObject.h>
#include <DataTypes/Serializations/SimpleTextSerialization.h> #include <DataTypes/DataTypeObject.h>
#include <Common/ObjectPool.h> #include <list>
namespace DB namespace DB
{ {
/** Serialization for data type Object. class SerializationObjectDynamicPath;
* Supported only text serialization/deserialization. class SerializationSubObject;
* and binary bulk serialization/deserialization without position independent
* encoding, i.e. serialization/deserialization into Native format. /// Class for binary serialization/deserialization of an Object type (currently only JSON).
*/
template <typename Parser>
class SerializationObject : public ISerialization class SerializationObject : public ISerialization
{ {
public: public:
/** In Native format ColumnObject can be serialized /// Serialization can change in future. Let's introduce serialization version.
* in two formats: as Tuple or as String. struct ObjectSerializationVersion
* The format is the following: {
* enum Value
* <serialization_kind> 1 byte -- 0 if Tuple, 1 if String. {
* [type_name] -- Only for tuple serialization. BASIC = 0,
* ... data of internal column ... };
*
* ClickHouse client serializazes objects as tuples. Value value;
* String serialization exists for clients, which cannot
* do parsing by themselves and they can send raw data as static void checkVersion(UInt64 version);
* string. It will be parsed on the server side.
*/ explicit ObjectSerializationVersion(UInt64 version);
};
SerializationObject(
std::unordered_map<String, SerializationPtr> typed_path_serializations_,
const std::unordered_set<String> & paths_to_skip_,
const std::vector<String> & path_regexps_to_skip_);
void enumerateStreams(
EnumerateStreamsSettings & settings,
const StreamCallback & callback,
const SubstreamData & data) const override;
void serializeBinaryBulkStatePrefix( void serializeBinaryBulkStatePrefix(
const IColumn & column, const IColumn & column,
@ -63,59 +72,55 @@ public:
void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
void deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; static void restoreColumnObject(ColumnObject & column_object, size_t prev_size);
void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
void serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent) const override;
void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
void serializeTextMarkdown(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
private: private:
enum class BinarySerializationKind : UInt8 friend SerializationObjectDynamicPath;
friend SerializationSubObject;
/// State of an Object structure. Can be also used during deserializing of Object subcolumns.
struct DeserializeBinaryBulkStateObjectStructure : public ISerialization::DeserializeBinaryBulkState
{ {
TUPLE = 0, ObjectSerializationVersion structure_version;
STRING = 1, size_t max_dynamic_paths;
std::vector<String> sorted_dynamic_paths;
std::unordered_set<String> dynamic_paths;
/// Paths statistics. Map (dynamic path) -> (number of non-null values in this path).
ColumnObject::StatisticsPtr statistics;
explicit DeserializeBinaryBulkStateObjectStructure(UInt64 structure_version_) : structure_version(structure_version_) {}
}; };
struct SerializeStateObject; static DeserializeBinaryBulkStatePtr deserializeObjectStructureStatePrefix(
struct DeserializeStateObject;
void deserializeBinaryBulkFromString(
ColumnObject & column_object,
size_t limit,
DeserializeBinaryBulkSettings & settings, DeserializeBinaryBulkSettings & settings,
DeserializeStateObject & state, SubstreamsDeserializeStatesCache * cache);
SubstreamsCache * cache) const;
void deserializeBinaryBulkFromTuple( /// Shared data has type Array(Tuple(String, String)).
ColumnObject & column_object, static const DataTypePtr & getTypeOfSharedData();
size_t limit,
DeserializeBinaryBulkSettings & settings,
DeserializeStateObject & state,
SubstreamsCache * cache) const;
template <typename TSettings> struct TypedPathSubcolumnCreator : public ISubcolumnCreator
void checkSerializationIsSupported(const TSettings & settings) const; {
String path;
template <typename Reader> explicit TypedPathSubcolumnCreator(const String & path_) : path(path_) {}
void deserializeTextImpl(IColumn & column, Reader && reader) const;
void serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const; DataTypePtr create(const DataTypePtr & prev) const override { return prev; }
ColumnPtr create(const ColumnPtr & prev) const override { return prev; }
SerializationPtr create(const SerializationPtr & prev) const override;
};
template <bool pretty_json = false> protected:
void serializeTextFromSubcolumn(const ColumnObject::Subcolumn & subcolumn, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent = 0) const; bool shouldSkipPath(const String & path) const;
/// Pool of parser objects to make SerializationObject thread safe. std::unordered_map<String, SerializationPtr> typed_path_serializations;
mutable SimpleObjectPool<Parser> parsers_pool; std::unordered_set<String> paths_to_skip;
std::vector<String> sorted_paths_to_skip;
std::list<re2::RE2> path_regexps_to_skip;
SerializationPtr dynamic_serialization;
private:
std::vector<String> sorted_typed_paths;
SerializationPtr shared_data_serialization;
}; };
SerializationPtr getObjectSerialization(const String & schema_format);
} }

View File

@ -0,0 +1,586 @@
#include <DataTypes/Serializations/SerializationObjectDeprecated.h>
#include <DataTypes/Serializations/JSONDataParser.h>
#include <DataTypes/Serializations/SerializationString.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/ObjectUtils.h>
#include <DataTypes/DataTypeFactory.h>
#include <Common/JSONParsers/SimdJSONParser.h>
#include <Common/JSONParsers/RapidJSONParser.h>
#include <Columns/ColumnObjectDeprecated.h>
#include <Columns/ColumnTuple.h>
#include <Columns/ColumnString.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <IO/VarInt.h>
#include <magic_enum.hpp>
#include <memory>
#include <string>
namespace DB
{
/// Error codes referenced in this file. They are only declared here (extern);
/// the definitions live elsewhere.
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
extern const int INCORRECT_DATA;
extern const int CANNOT_READ_ALL_DATA;
extern const int ARGUMENT_OUT_OF_BOUND;
extern const int CANNOT_PARSE_TEXT;
extern const int EXPERIMENTAL_FEATURE_ERROR;
}
/// Reads one textual object with `reader`, parses it into (path, value) pairs and appends
/// it to the object column as a new row.
/// `reader` is called once with a String that it must fill with the raw text.
/// Throws INCORRECT_DATA when the parser reports a failure.
template <typename Parser>
template <typename Reader>
void SerializationObjectDeprecated<Parser>::deserializeTextImpl(IColumn & column, Reader && reader) const
{
    auto & object_column = assert_cast<ColumnObjectDeprecated &>(column);

    String text;
    reader(text);

    std::optional<ParseResult> parsed;
    if (text.empty())
    {
        /// Treat empty string as an empty object
        /// for better CAST from String to Object.
        parsed = ParseResult{};
    }
    else
    {
        auto parser = parsers_pool.get([] { return new Parser; });
        parsed = parser->parse(text.data(), text.size());
    }

    if (!parsed)
        throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse object");

    auto & [paths, values] = *parsed;
    assert(paths.size() == values.size());

    /// Number of rows before this insertion; every subcolumn touched below must have exactly this size.
    size_t rows_before = object_column.size();

    for (size_t i = 0; i < paths.size(); ++i)
    {
        auto & path = paths[i];
        auto & value = values[i];

        auto info = getFieldInfo(value);
        if (info.need_fold_dimension)
            value = applyVisitor(FieldVisitorFoldDimension(info.num_dimensions), std::move(value));

        /// Values whose scalar type is Nothing are not inserted at all.
        if (isNothing(info.scalar_type))
            continue;

        /// Create the subcolumn on first occurrence of the path.
        if (!object_column.hasSubcolumn(path))
        {
            if (path.hasNested())
                object_column.addNestedSubcolumn(path, info, rows_before);
            else
                object_column.addSubcolumn(path, rows_before);
        }

        auto & subcolumn = object_column.getSubcolumn(path);
        assert(subcolumn.size() == rows_before);
        subcolumn.insert(std::move(value), std::move(info));
    }

    /// Insert default values to missed subcolumns
    /// (those whose size did not change during this insertion).
    const auto & all_subcolumns = object_column.getSubcolumns();
    for (const auto & node : all_subcolumns)
    {
        if (node->data.size() != rows_before)
            continue;

        if (!object_column.tryInsertDefaultFromNested(node))
            node->data.insertDefault();
    }

    object_column.incrementNumRows();
}
/// Reads the text with readStringInto and parses it as one object. Format settings are not used.
template <typename Parser>
void SerializationObjectDeprecated<Parser>::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
{
    auto read_text = [&istr](String & out) { readStringInto(out, istr); };
    deserializeTextImpl(column, read_text);
}
/// Reads one escaped string and parses it as one object.
/// The CRLF-aware reader is used when `settings.tsv.crlf_end_of_line_input` is set.
template <typename Parser>
void SerializationObjectDeprecated<Parser>::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
    auto read_escaped = [&istr, &settings](String & out)
    {
        if (settings.tsv.crlf_end_of_line_input)
            readEscapedStringCRLF(out, istr);
        else
            readEscapedString(out, istr);
    };
    deserializeTextImpl(column, read_escaped);
}
/// Reads one quoted string with readQuotedStringInto<true> and parses it as one object.
/// Format settings are not used.
template <typename Parser>
void SerializationObjectDeprecated<Parser>::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
{
    auto read_quoted = [&istr](String & out) { readQuotedStringInto<true>(out, istr); };
    deserializeTextImpl(column, read_quoted);
}
/// Reads the raw JSON text of one value with Parser::readJSON and parses it as one object.
/// Format settings are not used.
template <typename Parser>
void SerializationObjectDeprecated<Parser>::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
{
    auto read_json = [&istr](String & out) { Parser::readJSON(out, istr); };
    deserializeTextImpl(column, read_json);
}
/// Reads one CSV field (honouring `settings.csv`) and parses it as one object.
template <typename Parser>
void SerializationObjectDeprecated<Parser>::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
    auto read_csv = [&istr, &settings](String & out) { readCSVStringInto(out, istr, settings.csv); };
    deserializeTextImpl(column, read_csv);
}
/// Guard shared by the binary bulk (de)serialization methods:
/// throws NOT_IMPLEMENTED when position independent encoding is requested in `settings`.
template <typename Parser>
template <typename TSettings>
void SerializationObjectDeprecated<Parser>::checkSerializationIsSupported(const TSettings & settings) const
{
    if (!settings.position_independent_encoding)
        return;

    throw Exception(ErrorCodes::NOT_IMPLEMENTED,
        "DataTypeObject doesn't support serialization with position independent encoding");
}
/// State kept between the prefix, data and suffix stages of binary bulk serialization.
/// It is filled in serializeBinaryBulkStatePrefix.
template <typename Parser>
struct SerializationObjectDeprecated<Parser>::SerializeStateObject : public ISerialization::SerializeBinaryBulkState
{
/// Tuple type the object was converted to when the prefix was written.
DataTypePtr nested_type;
/// Default serialization of `nested_type`.
SerializationPtr nested_serialization;
/// State of `nested_serialization`.
SerializeBinaryBulkStatePtr nested_state;
};
/// State kept between the prefix and data stages of binary bulk deserialization.
/// It is filled in deserializeBinaryBulkStatePrefix.
template <typename Parser>
struct SerializationObjectDeprecated<Parser>::DeserializeStateObject : public ISerialization::DeserializeBinaryBulkState
{
/// Kind of binary serialization read from the structure stream (TUPLE or STRING).
BinarySerializationKind kind;
/// Type of the nested data: the Tuple type read from the stream, or String.
DataTypePtr nested_type;
/// Serialization used to read the nested data.
SerializationPtr nested_serialization;
/// State of `nested_serialization`.
DeserializeBinaryBulkStatePtr nested_state;
};
/// Writes the prefix of binary bulk serialization and creates the serialization state.
/// The object is converted to a Tuple; the serialization kind (TUPLE) and the Tuple type name
/// are written to the DeprecatedObjectStructure stream, then the prefix of the Tuple
/// serialization is written under the DeprecatedObjectData substream.
/// Throws NOT_IMPLEMENTED if `state` is already set or position independent encoding is requested,
/// and EXPERIMENTAL_FEATURE_ERROR if the structure stream is missing.
template <typename Parser>
void SerializationObjectDeprecated<Parser>::serializeBinaryBulkStatePrefix(
const IColumn & column,
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const
{
checkSerializationIsSupported(settings);
if (state)
throw Exception(ErrorCodes::NOT_IMPLEMENTED,
"DataTypeObject doesn't support serialization with non-trivial state");
const auto & column_object = assert_cast<const ColumnObjectDeprecated &>(column);
/// A non-finalized column is first cloned into a finalized one and the call is repeated on the clone.
if (!column_object.isFinalized())
{
auto finalized = column_object.cloneFinalized();
serializeBinaryBulkStatePrefix(*finalized, settings, state);
return;
}
settings.path.push_back(Substream::DeprecatedObjectStructure);
auto * stream = settings.getter(settings.path);
if (!stream)
throw Exception(ErrorCodes::EXPERIMENTAL_FEATURE_ERROR, "Missing stream for kind of binary serialization");
auto [tuple_column, tuple_type] = unflattenObjectToTuple(column_object);
/// Header: one byte with the serialization kind followed by the name of the Tuple type.
writeIntBinary(static_cast<UInt8>(BinarySerializationKind::TUPLE), *stream);
writeStringBinary(tuple_type->getName(), *stream);
auto state_object = std::make_shared<SerializeStateObject>();
state_object->nested_type = tuple_type;
state_object->nested_serialization = tuple_type->getDefaultSerialization();
/// Reuse the path element pushed above: switch it from the structure substream to the data substream.
settings.path.back() = Substream::DeprecatedObjectData;
state_object->nested_serialization->serializeBinaryBulkStatePrefix(*tuple_column, settings, state_object->nested_state);
state = std::move(state_object);
settings.path.pop_back();
}
/// Writes the suffix of binary bulk serialization by delegating to the nested (Tuple)
/// serialization stored in the state, under the DeprecatedObjectData substream.
template <typename Parser>
void SerializationObjectDeprecated<Parser>::serializeBinaryBulkStateSuffix(
    SerializeBinaryBulkSettings & settings,
    SerializeBinaryBulkStatePtr & state) const
{
    checkSerializationIsSupported(settings);

    auto * object_state = checkAndGetState<SerializeStateObject>(state);

    /// The nested serialization sees the data substream appended to the path; restore the path afterwards.
    settings.path.push_back(Substream::DeprecatedObjectData);
    object_state->nested_serialization->serializeBinaryBulkStateSuffix(settings, object_state->nested_state);
    settings.path.pop_back();
}
/// Reads the prefix of binary bulk serialization and creates the deserialization state.
/// The kind of serialization (one byte) is read from the DeprecatedObjectStructure stream;
/// for TUPLE it is followed by the name of the Tuple type, for STRING the nested type is String.
/// Then the prefix of the nested serialization is read under the DeprecatedObjectData substream.
/// Throws NOT_IMPLEMENTED if `state` is already set or position independent encoding is requested,
/// CANNOT_READ_ALL_DATA if the structure stream is missing, and INCORRECT_DATA
/// if the kind is unknown or the written type is not a Tuple.
template <typename Parser>
void SerializationObjectDeprecated<Parser>::deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state,
SubstreamsDeserializeStatesCache * cache) const
{
checkSerializationIsSupported(settings);
if (state)
throw Exception(ErrorCodes::NOT_IMPLEMENTED,
"DataTypeObject doesn't support serialization with non-trivial state");
/// The path is extended only for the duration of the stream lookup.
settings.path.push_back(Substream::DeprecatedObjectStructure);
auto * stream = settings.getter(settings.path);
settings.path.pop_back();
if (!stream)
throw Exception(ErrorCodes::CANNOT_READ_ALL_DATA,
"Cannot read kind of binary serialization of DataTypeObject, because its stream is missing");
UInt8 kind_raw;
readIntBinary(kind_raw, *stream);
/// enum_cast yields an empty optional for a value that does not correspond to any enumerator.
auto kind = magic_enum::enum_cast<BinarySerializationKind>(kind_raw);
if (!kind)
throw Exception(ErrorCodes::INCORRECT_DATA,
"Unknown binary serialization kind of Object: {}", std::to_string(kind_raw));
auto state_object = std::make_shared<DeserializeStateObject>();
state_object->kind = *kind;
if (state_object->kind == BinarySerializationKind::TUPLE)
{
String data_type_name;
readStringBinary(data_type_name, *stream);
state_object->nested_type = DataTypeFactory::instance().get(data_type_name);
state_object->nested_serialization = state_object->nested_type->getDefaultSerialization();
if (!isTuple(state_object->nested_type))
throw Exception(ErrorCodes::INCORRECT_DATA,
"Data of type Object should be written as Tuple, got: {}", data_type_name);
}
else if (state_object->kind == BinarySerializationKind::STRING)
{
state_object->nested_type = std::make_shared<DataTypeString>();
state_object->nested_serialization = std::make_shared<SerializationString>();
}
else
{
/// Defensive branch for a kind that passed enum_cast but is handled by neither branch above.
throw Exception(ErrorCodes::INCORRECT_DATA,
"Unknown binary serialization kind of Object: {}", std::to_string(kind_raw));
}
settings.path.push_back(Substream::DeprecatedObjectData);
state_object->nested_serialization->deserializeBinaryBulkStatePrefix(settings, state_object->nested_state, cache);
settings.path.pop_back();
state = std::move(state_object);
}
/// Writes rows [offset, offset + limit) of the object column in binary bulk format.
/// The column is converted to a Tuple whose type must be equal to the one remembered in the state
/// by serializeBinaryBulkStatePrefix; the data is then written by the nested Tuple serialization.
/// Throws EXPERIMENTAL_FEATURE_ERROR when the Tuple type differs from the one in the state.
template <typename Parser>
void SerializationObjectDeprecated<Parser>::serializeBinaryBulkWithMultipleStreams(
    const IColumn & column,
    size_t offset,
    size_t limit,
    SerializeBinaryBulkSettings & settings,
    SerializeBinaryBulkStatePtr & state) const
{
    checkSerializationIsSupported(settings);

    const auto & object_column = assert_cast<const ColumnObjectDeprecated &>(column);
    auto * object_state = checkAndGetState<SerializeStateObject>(state);

    /// A non-finalized column is first cloned into a finalized one and the call is repeated on the clone.
    if (!object_column.isFinalized())
    {
        auto finalized_clone = object_column.cloneFinalized();
        serializeBinaryBulkWithMultipleStreams(*finalized_clone, offset, limit, settings, state);
        return;
    }

    auto [as_tuple_column, as_tuple_type] = unflattenObjectToTuple(object_column);

    const bool type_matches = object_state->nested_type->equals(*as_tuple_type);
    if (!type_matches)
    {
        throw Exception(ErrorCodes::EXPERIMENTAL_FEATURE_ERROR,
            "Types of internal column of Object mismatched. Expected: {}, Got: {}",
            object_state->nested_type->getName(), as_tuple_type->getName());
    }

    settings.path.push_back(Substream::DeprecatedObjectData);

    /// Nothing is written when there is no stream for the data substream.
    const bool has_data_stream = settings.getter(settings.path) != nullptr;
    if (has_data_stream)
    {
        object_state->nested_serialization->serializeBinaryBulkWithMultipleStreams(
            *as_tuple_column, offset, limit, settings, object_state->nested_state);
    }

    settings.path.pop_back();
}
/// Reads up to `limit` rows in binary bulk format into `column`, which must be empty.
/// Depending on the kind stored in the state, the data is read either as strings that are parsed
/// one by one, or as a Tuple whose elements become subcolumns. The column is finalized at the end.
/// Throws NOT_IMPLEMENTED when `column` is not empty.
template <typename Parser>
void SerializationObjectDeprecated<Parser>::deserializeBinaryBulkWithMultipleStreams(
    ColumnPtr & column,
    size_t limit,
    DeserializeBinaryBulkSettings & settings,
    DeserializeBinaryBulkStatePtr & state,
    SubstreamsCache * cache) const
{
    checkSerializationIsSupported(settings);

    if (!column->empty())
        throw Exception(ErrorCodes::NOT_IMPLEMENTED,
            "DataTypeObject cannot be deserialized to non-empty column");

    auto mutable_column = column->assumeMutable();
    auto & object_column = assert_cast<ColumnObjectDeprecated &>(*mutable_column);
    auto * object_state = checkAndGetState<DeserializeStateObject>(state);

    settings.path.push_back(Substream::DeprecatedObjectData);

    /// Everything that is not STRING is read as a Tuple.
    if (object_state->kind != BinarySerializationKind::STRING)
        deserializeBinaryBulkFromTuple(object_column, limit, settings, *object_state, cache);
    else
        deserializeBinaryBulkFromString(object_column, limit, settings, *object_state, cache);

    settings.path.pop_back();

    object_column.checkConsistency();
    object_column.finalize();
    column = std::move(mutable_column);
}
/// Reads up to `limit` rows with the nested (String) serialization and parses every string
/// as a whole-text object that is appended to `column_object`.
/// Parsing uses default-constructed FormatSettings.
/// Throws CANNOT_PARSE_TEXT when a string is not consumed completely.
template <typename Parser>
void SerializationObjectDeprecated<Parser>::deserializeBinaryBulkFromString(
    ColumnObjectDeprecated & column_object,
    size_t limit,
    DeserializeBinaryBulkSettings & settings,
    DeserializeStateObject & state,
    SubstreamsCache * cache) const
{
    ColumnPtr strings = state.nested_type->createColumn();
    state.nested_serialization->deserializeBinaryBulkWithMultipleStreams(
        strings, limit, settings, state.nested_state, cache);

    const size_t rows = strings->size();
    column_object.reserve(rows);

    FormatSettings default_format_settings;
    for (size_t row = 0; row != rows; ++row)
    {
        const auto row_text = strings->getDataAt(row);
        ReadBufferFromMemory text_buffer(row_text.data, row_text.size);

        deserializeWholeText(column_object, text_buffer, default_format_settings);

        if (!text_buffer.eof())
            throw Exception(ErrorCodes::CANNOT_PARSE_TEXT,
                "Cannot parse string to column Object. Expected eof");
    }
}
/// Reads up to `limit` rows with the nested (Tuple) serialization, flattens the Tuple
/// and adds each flattened element to `column_object` as a subcolumn under its path.
/// Throws INCORRECT_DATA when the flattened column and the flattened type disagree in element count.
template <typename Parser>
void SerializationObjectDeprecated<Parser>::deserializeBinaryBulkFromTuple(
    ColumnObjectDeprecated & column_object,
    size_t limit,
    DeserializeBinaryBulkSettings & settings,
    DeserializeStateObject & state,
    SubstreamsCache * cache) const
{
    ColumnPtr nested_tuple = state.nested_type->createColumn();
    state.nested_serialization->deserializeBinaryBulkWithMultipleStreams(
        nested_tuple, limit, settings, state.nested_state, cache);

    /// Flatten the type (to get paths) and the column (to get the data) separately.
    auto [paths, types] = flattenTuple(state.nested_type);
    auto flat_column = flattenTuple(nested_tuple);
    const auto & element_columns = assert_cast<const ColumnTuple &>(*flat_column).getColumns();

    assert(paths.size() == types.size());

    const size_t subcolumns_count = paths.size();
    if (element_columns.size() != subcolumns_count)
        throw Exception(ErrorCodes::INCORRECT_DATA,
            "Inconsistent type ({}) and column ({}) while reading column of type Object",
            state.nested_type->getName(), nested_tuple->getName());

    for (size_t pos = 0; pos != subcolumns_count; ++pos)
        column_object.addSubcolumn(paths[pos], element_columns[pos]->assumeMutable());
}
/// Binary serialization of a single Field is not supported: always throws NOT_IMPLEMENTED.
template <typename Parser>
void SerializationObjectDeprecated<Parser>::serializeBinary(const Field &, WriteBuffer &, const FormatSettings &) const
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not implemented for SerializationObjectDeprecated");
}
/// Binary deserialization into a single Field is not supported: always throws NOT_IMPLEMENTED.
template <typename Parser>
void SerializationObjectDeprecated<Parser>::deserializeBinary(Field &, ReadBuffer &, const FormatSettings &) const
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not implemented for SerializationObjectDeprecated");
}
/// Binary serialization of a single column row is not supported by this serialization:
/// the method unconditionally throws NOT_IMPLEMENTED.
template <typename Parser>
void SerializationObjectDeprecated<Parser>::serializeBinary(const IColumn &, size_t, WriteBuffer &, const FormatSettings &) const
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not implemented for SerializationObjectDeprecated");
}
/// Binary deserialization of a single row into a column is not supported by this serialization:
/// the method unconditionally throws NOT_IMPLEMENTED.
template <typename Parser>
void SerializationObjectDeprecated<Parser>::deserializeBinary(IColumn &, ReadBuffer &, const FormatSettings &) const
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not implemented for SerializationObjectDeprecated");
}
/// TODO: use a format other than JSON in serializations.
/// Writes row `row_num` of the object column as a single-line JSON object:
/// every subcolumn is emitted as "path":value, entries are separated by commas
/// and the value itself is written by serializeTextFromSubcolumn().
template <typename Parser>
void SerializationObjectDeprecated<Parser>::serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
    const auto & object = assert_cast<const ColumnObjectDeprecated &>(column);
    const auto & entries = object.getSubcolumns();

    writeChar('{', ostr);

    bool is_first = true;
    for (const auto & node : entries)
    {
        /// The separator goes before every entry except the first one.
        if (!is_first)
            writeCString(",", ostr);
        is_first = false;

        writeDoubleQuoted(node->path.getPath(), ostr);
        writeChar(':', ostr);
        serializeTextFromSubcolumn(node->data, row_num, ostr, settings);
    }

    writeChar('}', ostr);
}
/// Writes the value of `subcolumn` at row `row_num` as JSON (pretty-printed when `pretty_json` is set).
/// Three layouts of the subcolumn are handled:
///  - finalized subcolumn: the row is taken from the single finalized column;
///  - row inside the prefix of defaults: a temporary one-row column with a default value is written;
///  - otherwise the row is looked up in the list of parts, each serialized with the type derived from the part.
/// Throws ARGUMENT_OUT_OF_BOUND if `row_num` is past the last part.
template <typename Parser>
template <bool pretty_json>
void SerializationObjectDeprecated<Parser>::serializeTextFromSubcolumn(
    const ColumnObjectDeprecated::Subcolumn & subcolumn, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent) const
{
    const auto & least_common_type = subcolumn.getLeastCommonType();

    /// Common tail of all branches: pick the serialization for (type, column) and write a single row.
    const auto write_value = [&](const auto & type, const IColumn & values, size_t row)
    {
        auto info = type->getSerializationInfo(values);
        auto serialization = type->getSerialization(*info);
        if constexpr (pretty_json)
            serialization->serializeTextJSONPretty(values, row, ostr, settings, indent);
        else
            serialization->serializeTextJSON(values, row, ostr, settings);
    };

    if (subcolumn.isFinalized())
    {
        write_value(least_common_type, subcolumn.getFinalizedColumn(), row_num);
        return;
    }

    const size_t defaults_in_prefix = subcolumn.getNumberOfDefaultsInPrefix();
    if (row_num < defaults_in_prefix)
    {
        /// Suboptimal, but it should happen rarely.
        auto default_holder = least_common_type->createColumn();
        default_holder->insertDefault();
        write_value(least_common_type, *default_holder, 0);
        return;
    }

    /// Position of the row relative to the beginning of the current part.
    size_t offset_in_parts = row_num - defaults_in_prefix;
    for (const auto & part : subcolumn.getData())
    {
        const size_t part_rows = part->size();
        if (offset_in_parts < part_rows)
        {
            write_value(getDataTypeByColumn(*part), *part, offset_in_parts);
            return;
        }
        offset_in_parts -= part_rows;
    }

    throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Index ({}) for text serialization is out of range", row_num);
}
/// Plain text output: forwards to serializeTextImpl(), which writes the row as a JSON object.
template <typename Parser>
void SerializationObjectDeprecated<Parser>::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
serializeTextImpl(column, row_num, ostr, settings);
}
/// Escaped text output: the row is first rendered as JSON into a temporary string,
/// and that string is then written to `ostr` with writeEscapedString().
template <typename Parser>
void SerializationObjectDeprecated<Parser>::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
    WriteBufferFromOwnString json_buffer;
    serializeTextImpl(column, row_num, json_buffer, settings);

    writeEscapedString(json_buffer.str(), ostr);
}
/// Quoted text output: the row is first rendered as JSON into a temporary string,
/// and that string is then written to `ostr` with writeQuotedString().
template <typename Parser>
void SerializationObjectDeprecated<Parser>::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
    WriteBufferFromOwnString json_buffer;
    serializeTextImpl(column, row_num, json_buffer, settings);

    writeQuotedString(json_buffer.str(), ostr);
}
/// JSON output: forwards to serializeTextImpl(), which already writes the row as a JSON object.
template <typename Parser>
void SerializationObjectDeprecated<Parser>::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
serializeTextImpl(column, row_num, ostr, settings);
}
/// CSV output: the row is first rendered as JSON into a temporary string,
/// and that string is then written to `ostr` with writeCSVString().
template <typename Parser>
void SerializationObjectDeprecated<Parser>::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
    WriteBufferFromOwnString json_buffer;
    serializeTextImpl(column, row_num, json_buffer, settings);

    writeCSVString(json_buffer.str(), ostr);
}
/// Markdown output. When `settings.markdown.escape_special_characters` is set, the JSON text
/// of the row is written with writeMarkdownEscapedString(); otherwise the regular
/// escaped text serialization is used.
template <typename Parser>
void SerializationObjectDeprecated<Parser>::serializeTextMarkdown(
    const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
    if (!settings.markdown.escape_special_characters)
    {
        serializeTextEscaped(column, row_num, ostr, settings);
        return;
    }

    WriteBufferFromOwnString json_buffer;
    serializeTextImpl(column, row_num, json_buffer, settings);
    writeMarkdownEscapedString(json_buffer.str(), ostr);
}
/// Writes row `row_num` of the object column as a multi-line JSON object.
/// Every entry goes on its own line, indented by four spaces per nesting level
/// (`indent` is the nesting level of the object itself); the closing brace is
/// written on a separate line at the indentation of the object.
template <typename Parser>
void SerializationObjectDeprecated<Parser>::serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent) const
{
    const auto & object = assert_cast<const ColumnObjectDeprecated &>(column);
    const auto & entries = object.getSubcolumns();
    const size_t nested_indent = indent + 1;

    writeCString("{\n", ostr);

    bool is_first = true;
    for (const auto & node : entries)
    {
        /// The separator goes before every entry except the first one.
        if (!is_first)
            writeCString(",\n", ostr);
        is_first = false;

        writeChar(' ', nested_indent * 4, ostr);
        writeDoubleQuoted(node->path.getPath(), ostr);
        writeCString(": ", ostr);
        serializeTextFromSubcolumn<true>(node->data, row_num, ostr, settings, nested_indent);
    }

    writeChar('\n', ostr);
    writeChar(' ', indent * 4, ostr);
    writeChar('}', ostr);
}
/// Creates the serialization of the deprecated Object type for the given schema format.
/// Only "json" is known; any other format results in NOT_IMPLEMENTED.
/// The JSON parser is chosen at build time: simdjson if available, otherwise rapidjson;
/// without either of them NOT_IMPLEMENTED is thrown.
SerializationPtr getObjectSerialization(const String & schema_format)
{
    if (schema_format != "json")
        throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unknown schema format '{}'", schema_format);

#if USE_SIMDJSON
    return std::make_shared<SerializationObjectDeprecated<JSONDataParser<SimdJSONParser>>>();
#elif USE_RAPIDJSON
    return std::make_shared<SerializationObjectDeprecated<JSONDataParser<RapidJSONParser>>>();
#else
    throw Exception(ErrorCodes::NOT_IMPLEMENTED,
        "To use data type Object with JSON format ClickHouse should be built with Simdjson or Rapidjson");
#endif
}
}

View File

@ -0,0 +1,121 @@
#pragma once
#include <Columns/ColumnObjectDeprecated.h>
#include <DataTypes/Serializations/SimpleTextSerialization.h>
#include <Common/ObjectPool.h>
namespace DB
{
/** Serialization for data type Object (deprecated).
* Supports only text serialization/deserialization
* and binary bulk serialization/deserialization without position-independent
* encoding, i.e. serialization/deserialization into Native format.
*
* `Parser` is the parser type whose instances are kept in `parsers_pool`.
*/
template <typename Parser>
class SerializationObjectDeprecated : public ISerialization
{
public:
/** In Native format ColumnObjectDeprecated can be serialized
* in two formats: as Tuple or as String.
* The format is the following:
*
* <serialization_kind> 1 byte -- 0 if Tuple, 1 if String.
* [type_name] -- Only for tuple serialization.
* ... data of internal column ...
*
* ClickHouse client serializes objects as tuples.
* String serialization exists for clients that cannot
* do the parsing by themselves, so they can send raw data as
* a string. It will be parsed on the server side.
*/
void serializeBinaryBulkStatePrefix(
const IColumn & column,
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void serializeBinaryBulkStateSuffix(
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state,
SubstreamsDeserializeStatesCache * cache) const override;
void serializeBinaryBulkWithMultipleStreams(
const IColumn & column,
size_t offset,
size_t limit,
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void deserializeBinaryBulkWithMultipleStreams(
ColumnPtr & column,
size_t limit,
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state,
SubstreamsCache * cache) const override;
/// Binary (de)serialization of single values (Field or one column row).
void serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings &) const override;
void deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings &) const override;
void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
void deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
/// Text serialization of a single row in the different text formats.
void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
void serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent) const override;
void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
void serializeTextMarkdown(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
/// Text deserialization of a single row from the different text formats.
void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
private:
/// Value of the <serialization_kind> byte described above.
enum class BinarySerializationKind : UInt8
{
TUPLE = 0,
STRING = 1,
};
struct SerializeStateObject;
struct DeserializeStateObject;
/// Reads the nested String column and parses every string into `column_object`.
void deserializeBinaryBulkFromString(
ColumnObjectDeprecated & column_object,
size_t limit,
DeserializeBinaryBulkSettings & settings,
DeserializeStateObject & state,
SubstreamsCache * cache) const;
/// Reads the nested Tuple column, flattens it and adds its elements to `column_object` as subcolumns.
void deserializeBinaryBulkFromTuple(
ColumnObjectDeprecated & column_object,
size_t limit,
DeserializeBinaryBulkSettings & settings,
DeserializeStateObject & state,
SubstreamsCache * cache) const;
template <typename TSettings>
void checkSerializationIsSupported(const TSettings & settings) const;
template <typename Reader>
void deserializeTextImpl(IColumn & column, Reader && reader) const;
/// Writes one row as a single-line JSON object.
void serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const;
/// Writes the value of one subcolumn at `row_num` as JSON; `indent` is used only when `pretty_json` is true.
template <bool pretty_json = false>
void serializeTextFromSubcolumn(const ColumnObjectDeprecated::Subcolumn & subcolumn, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent = 0) const;
/// Pool of parser objects to make SerializationObjectDeprecated thread safe.
mutable SimpleObjectPool<Parser> parsers_pool;
};
/// Returns the serialization of the deprecated Object type for the given schema format (e.g. "json").
SerializationPtr getObjectSerialization(const String & schema_format);
}

View File

@ -0,0 +1,192 @@
#include <Columns/ColumnDynamic.h>
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/DataTypeVariant.h>
#include <DataTypes/Serializations/SerializationObject.h>
#include <DataTypes/Serializations/SerializationObjectDynamicPath.h>
namespace DB
{
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
}
/// Wraps `nested_` and remembers which dynamic path (`path_`) and, optionally, which subcolumn of it
/// (`path_subcolumn_`) has to be read. Also prepares the serializations that may be needed while reading:
/// a SerializationDynamic and the default serialization of the shared data type.
SerializationObjectDynamicPath::SerializationObjectDynamicPath(
const DB::SerializationPtr & nested_, const String & path_, const String & path_subcolumn_, size_t max_dynamic_types_)
: SerializationWrapper(nested_)
, path(path_)
, path_subcolumn(path_subcolumn_)
, dynamic_serialization(std::make_shared<SerializationDynamic>())
, shared_data_serialization(SerializationObject::getTypeOfSharedData()->getDefaultSerialization())
, max_dynamic_types(max_dynamic_types_)
{
}
/// Deserialization state of SerializationObjectDynamicPath.
struct DeserializeBinaryBulkStateObjectDynamicPath : public ISerialization::DeserializeBinaryBulkState
{
    /// State of the object structure stream (holds the set of dynamic paths).
    ISerialization::DeserializeBinaryBulkStatePtr structure_state;
    /// State of the stream that is actually read: either the dynamic path itself or the shared data.
    ISerialization::DeserializeBinaryBulkStatePtr nested_state;
    /// True when the requested path is not among the dynamic paths and has to be extracted from shared data.
    /// Explicitly initialized so that the flag never holds an indeterminate value,
    /// no matter how the state object is created.
    bool read_from_shared_data = false;
    /// Shared data column accumulated while reading; used only when `read_from_shared_data` is true.
    ColumnPtr shared_data;
};
/// Enumerates the substreams needed to read the requested dynamic path.
/// The object structure stream is always reported. The data streams can be enumerated
/// only when a deserialization state is available: depending on the structure stored in it,
/// either the streams of the dynamic path itself or the streams of the shared data are reported.
void SerializationObjectDynamicPath::enumerateStreams(
    DB::ISerialization::EnumerateStreamsSettings & settings,
    const DB::ISerialization::StreamCallback & callback,
    const DB::ISerialization::SubstreamData & data) const
{
    auto & stream_path = settings.path;

    stream_path.push_back(Substream::ObjectStructure);
    callback(stream_path);
    stream_path.pop_back();

    /// We cannot enumerate anything if we don't have deserialization state, as we don't know the dynamic structure.
    const auto * path_state = data.deserialize_state
        ? checkAndGetState<DeserializeBinaryBulkStateObjectDynamicPath>(data.deserialize_state)
        : nullptr;
    if (!path_state)
        return;

    stream_path.push_back(Substream::ObjectData);
    const auto * object_structure
        = checkAndGetState<SerializationObject::DeserializeBinaryBulkStateObjectStructure>(path_state->structure_state);

    if (!object_structure->dynamic_paths.contains(path))
    {
        /// The path is not a dynamic path: all shared data has to be read to look for it there.
        stream_path.push_back(Substream::ObjectSharedData);
        auto shared_substream = SubstreamData(shared_data_serialization)
                                    .withType(data.type ? SerializationObject::getTypeOfSharedData() : nullptr)
                                    .withColumn(data.column ? SerializationObject::getTypeOfSharedData()->createColumn() : nullptr)
                                    .withSerializationInfo(data.serialization_info)
                                    .withDeserializeState(path_state->nested_state);
        stream_path.back().data = shared_substream;
        shared_data_serialization->enumerateStreams(settings, callback, shared_substream);
        stream_path.pop_back();
    }
    else
    {
        /// The path is stored as a separate dynamic path.
        stream_path.push_back(Substream::ObjectDynamicPath);
        stream_path.back().object_path_name = path;
        auto dynamic_substream = SubstreamData(nested_serialization)
                                     .withType(data.type)
                                     .withColumn(data.column)
                                     .withSerializationInfo(data.serialization_info)
                                     .withDeserializeState(path_state->nested_state);
        stream_path.back().data = dynamic_substream;
        nested_serialization->enumerateStreams(settings, callback, dynamic_substream);
        stream_path.pop_back();
    }

    stream_path.pop_back();
}
/// Writing is not supported by this serialization: the method unconditionally throws NOT_IMPLEMENTED.
void SerializationObjectDynamicPath::serializeBinaryBulkStatePrefix(const IColumn &, SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const
{
throw Exception(
ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinaryBulkStatePrefix is not implemented for SerializationObjectDynamicPath");
}
/// Writing is not supported by this serialization: the method unconditionally throws NOT_IMPLEMENTED.
void SerializationObjectDynamicPath::serializeBinaryBulkStateSuffix(SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const
{
throw Exception(
ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinaryBulkStateSuffix is not implemented for SerializationObjectDynamicPath");
}
/// Prepares the deserialization state for reading the requested dynamic path.
/// The object structure state is read first; if it is not available, `state` is left untouched.
/// Otherwise the structure decides whether the path is read from its own dynamic path stream
/// or has to be extracted from shared data, and the prefix of the corresponding stream is read.
void SerializationObjectDynamicPath::deserializeBinaryBulkStatePrefix(
    DeserializeBinaryBulkSettings & settings, DeserializeBinaryBulkStatePtr & state, SubstreamsDeserializeStatesCache * cache) const
{
    auto structure_state = SerializationObject::deserializeObjectStructureStatePrefix(settings, cache);
    if (!structure_state)
        return;

    auto path_state = std::make_shared<DeserializeBinaryBulkStateObjectDynamicPath>();
    path_state->structure_state = std::move(structure_state);

    /// Remember if we need to read from shared data or we have this path in dynamic paths.
    const auto * object_structure
        = checkAndGetState<SerializationObject::DeserializeBinaryBulkStateObjectStructure>(path_state->structure_state);
    path_state->read_from_shared_data = !object_structure->dynamic_paths.contains(path);

    auto & stream_path = settings.path;
    stream_path.push_back(Substream::ObjectData);

    if (!path_state->read_from_shared_data)
    {
        stream_path.push_back(Substream::ObjectDynamicPath);
        stream_path.back().object_path_name = path;
        nested_serialization->deserializeBinaryBulkStatePrefix(settings, path_state->nested_state, cache);
        stream_path.pop_back();
    }
    else
    {
        stream_path.push_back(Substream::ObjectSharedData);
        shared_data_serialization->deserializeBinaryBulkStatePrefix(settings, path_state->nested_state, cache);
        stream_path.pop_back();
    }

    stream_path.pop_back();
    state = std::move(path_state);
}
/// Writing is not supported by this serialization: the method unconditionally throws NOT_IMPLEMENTED.
void SerializationObjectDynamicPath::serializeBinaryBulkWithMultipleStreams(const IColumn &, size_t, size_t, SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinaryBulkWithMultipleStreams is not implemented for SerializationObjectDynamicPath");
}
/// Reads up to `limit` rows of the requested dynamic path (or of its subcolumn) into `result_column`.
/// Does nothing if `state` is empty (no state was created in deserializeBinaryBulkStatePrefix).
/// If the path is stored as a separate dynamic path, it is read directly with the nested serialization.
/// Otherwise the shared data is read and the values of the path are extracted from it.
void SerializationObjectDynamicPath::deserializeBinaryBulkWithMultipleStreams(
    ColumnPtr & result_column,
    size_t limit,
    DeserializeBinaryBulkSettings & settings,
    DeserializeBinaryBulkStatePtr & state,
    SubstreamsCache * cache) const
{
    if (!state)
        return;

    auto * dynamic_path_state = checkAndGetState<DeserializeBinaryBulkStateObjectDynamicPath>(state);
    settings.path.push_back(Substream::ObjectData);
    /// Check if we don't need to read shared data. In this case just read data from dynamic path.
    if (!dynamic_path_state->read_from_shared_data)
    {
        settings.path.push_back(Substream::ObjectDynamicPath);
        settings.path.back().object_path_name = path;
        nested_serialization->deserializeBinaryBulkWithMultipleStreams(result_column, limit, settings, dynamic_path_state->nested_state, cache);
        settings.path.pop_back();
    }
    /// Otherwise, read the whole shared data column and extract requested path from it.
    /// TODO: We can read several subcolumns of the same path located in the shared data
    ///       and right now we extract the whole path column from shared data every time
    ///       and then extract the requested subcolumns. We can optimize it and use substreams
    ///       cache here to avoid extracting the same path from shared data several times.
    ///
    /// TODO: We can change the serialization of shared data to optimize reading paths from it.
    ///       Right now we cannot know if shared data contains our path in current range or not,
    ///       but we can change the serialization and write the list of all paths stored in shared
    ///       data before each granule, and then replace the column that stores paths with column
    ///       with indexes in this list. It can also reduce the storage, because we will store
    ///       each path only once and can replace UInt64 string offset column with indexes column
    ///       that can have smaller type depending on the number of paths in the list.
    else
    {
        settings.path.push_back(Substream::ObjectSharedData);
        /// Initialize shared_data column if needed.
        if (result_column->empty())
            dynamic_path_state->shared_data = SerializationObject::getTypeOfSharedData()->createColumn();
        size_t prev_size = result_column->size();
        shared_data_serialization->deserializeBinaryBulkWithMultipleStreams(dynamic_path_state->shared_data, limit, settings, dynamic_path_state->nested_state, cache);
        /// `limit` is only an upper bound: fewer rows may have been read (e.g. at the end of the stream).
        /// The number of rows added to the result must match the number of rows actually read into shared data.
        const size_t num_read_rows = dynamic_path_state->shared_data->size() - prev_size;
        /// If we need to read a subcolumn from Dynamic column, create an empty Dynamic column, fill it and extract subcolumn.
        MutableColumnPtr dynamic_column = path_subcolumn.empty() ? result_column->assumeMutable() : ColumnDynamic::create(max_dynamic_types)->getPtr();
        /// Check if we don't have any paths in shared data in current range.
        const auto & offsets = assert_cast<const ColumnArray &>(*dynamic_path_state->shared_data).getOffsets();
        if (offsets.back() == offsets[ssize_t(prev_size) - 1])
            dynamic_column->insertManyDefaults(num_read_rows);
        else
            ColumnObject::fillPathColumnFromSharedData(*dynamic_column, path, dynamic_path_state->shared_data, prev_size, dynamic_path_state->shared_data->size());
        /// Extract subcolumn from Dynamic column if needed.
        if (!path_subcolumn.empty())
        {
            auto subcolumn = std::make_shared<DataTypeDynamic>(max_dynamic_types)->getSubcolumn(path_subcolumn, dynamic_column->getPtr());
            result_column->assumeMutable()->insertRangeFrom(*subcolumn, 0, subcolumn->size());
        }
        settings.path.pop_back();
    }
    settings.path.pop_back();
}
}

View File

@ -0,0 +1,58 @@
#pragma once
#include <DataTypes/Serializations/SerializationWrapper.h>
namespace DB
{
/// Serialization of dynamic Object paths.
/// For example, if we have type JSON(a.b UInt32, b.c String) and data {"a" : {"b" : 42}, "b" : {"c" : "Hello"}, "c" : {"d" : [1, 2, 3]}, "d" : 42}
/// this class will be responsible for reading dynamic paths 'c.d' and 'd' as subcolumns.
/// Typed paths 'a.b' and 'b.c' are serialized in SerializationObjectTypedPath.
/// Only reading is supported: the serialize* methods throw NOT_IMPLEMENTED.
class SerializationObjectDynamicPath final : public SerializationWrapper
{
public:
SerializationObjectDynamicPath(const SerializationPtr & nested_, const String & path_, const String & path_subcolumn_, size_t max_dynamic_types_);
void enumerateStreams(
EnumerateStreamsSettings & settings,
const StreamCallback & callback,
const SubstreamData & data) const override;
void serializeBinaryBulkStatePrefix(
const IColumn & column,
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void serializeBinaryBulkStateSuffix(
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state,
SubstreamsDeserializeStatesCache * cache) const override;
void serializeBinaryBulkWithMultipleStreams(
const IColumn & column,
size_t offset,
size_t limit,
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void deserializeBinaryBulkWithMultipleStreams(
ColumnPtr & column,
size_t limit,
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state,
SubstreamsCache * cache) const override;
private:
/// Path inside the object that is read by this serialization.
String path;
/// Subcolumn of the Dynamic value stored at `path` that has to be extracted;
/// empty when the whole Dynamic column is requested.
String path_subcolumn;
/// Instance of SerializationDynamic created in the constructor.
SerializationPtr dynamic_serialization;
/// Default serialization of the shared data type; used when the path has to be looked up in shared data.
SerializationPtr shared_data_serialization;
/// Passed to the temporary Dynamic column and Dynamic type created while extracting a subcolumn from shared data.
size_t max_dynamic_types;
};
}

View File

@ -0,0 +1,78 @@
#include <Columns/ColumnDynamic.h>
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/DataTypeVariant.h>
#include <DataTypes/Serializations/SerializationObject.h>
#include <DataTypes/Serializations/SerializationObjectTypedPath.h>
#include <IO/ReadHelpers.h>
namespace DB
{
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
}
/// Enumerates the substreams of the typed path: the nested serialization is asked to enumerate
/// its streams under ObjectData / ObjectTypedPath(path).
void SerializationObjectTypedPath::enumerateStreams(
    DB::ISerialization::EnumerateStreamsSettings & settings,
    const DB::ISerialization::StreamCallback & callback,
    const DB::ISerialization::SubstreamData & data) const
{
    auto & stream_path = settings.path;

    stream_path.push_back(Substream::ObjectData);
    stream_path.push_back(Substream::ObjectTypedPath);
    stream_path.back().object_path_name = path;

    /// Type, column, serialization info and state are passed to the nested serialization as is.
    auto nested_data = SubstreamData(nested_serialization)
                           .withType(data.type)
                           .withColumn(data.column)
                           .withSerializationInfo(data.serialization_info)
                           .withDeserializeState(data.deserialize_state);
    nested_serialization->enumerateStreams(settings, callback, nested_data);

    stream_path.pop_back();
    stream_path.pop_back();
}
/// Writing is not supported by this serialization: the method unconditionally throws NOT_IMPLEMENTED.
void SerializationObjectTypedPath::serializeBinaryBulkStatePrefix(const IColumn &, SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const
{
throw Exception(
ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinaryBulkStatePrefix is not implemented for SerializationObjectTypedPath");
}
/// Writing is not supported by this serialization: the method unconditionally throws NOT_IMPLEMENTED.
void SerializationObjectTypedPath::serializeBinaryBulkStateSuffix(SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const
{
throw Exception(
ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinaryBulkStateSuffix is not implemented for SerializationObjectTypedPath");
}
/// Reads the state prefix of the typed path: delegates to the nested serialization
/// with the substream path extended by ObjectData / ObjectTypedPath(path).
void SerializationObjectTypedPath::deserializeBinaryBulkStatePrefix(
    DeserializeBinaryBulkSettings & settings, DeserializeBinaryBulkStatePtr & state, SubstreamsDeserializeStatesCache * cache) const
{
    auto & stream_path = settings.path;

    stream_path.push_back(Substream::ObjectData);
    stream_path.push_back(Substream::ObjectTypedPath);
    stream_path.back().object_path_name = path;

    nested_serialization->deserializeBinaryBulkStatePrefix(settings, state, cache);

    /// Restore the substream path for the caller.
    stream_path.pop_back();
    stream_path.pop_back();
}
/// Writing is not supported by this serialization: the method unconditionally throws NOT_IMPLEMENTED.
void SerializationObjectTypedPath::serializeBinaryBulkWithMultipleStreams(const IColumn &, size_t, size_t, SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinaryBulkWithMultipleStreams is not implemented for SerializationObjectTypedPath");
}
/// Reads the data of the typed path: delegates to the nested serialization
/// with the substream path extended by ObjectData / ObjectTypedPath(path).
void SerializationObjectTypedPath::deserializeBinaryBulkWithMultipleStreams(
    ColumnPtr & result_column,
    size_t limit,
    DeserializeBinaryBulkSettings & settings,
    DeserializeBinaryBulkStatePtr & state,
    SubstreamsCache * cache) const
{
    auto & stream_path = settings.path;

    stream_path.push_back(Substream::ObjectData);
    stream_path.push_back(Substream::ObjectTypedPath);
    stream_path.back().object_path_name = path;

    nested_serialization->deserializeBinaryBulkWithMultipleStreams(result_column, limit, settings, state, cache);

    /// Restore the substream path for the caller.
    stream_path.pop_back();
    stream_path.pop_back();
}
}

View File

@ -0,0 +1,57 @@
#pragma once
#include <DataTypes/Serializations/SerializationWrapper.h>
namespace DB
{
/// Serialization of typed Object paths.
/// For example, for type JSON(a.b UInt32, b.c String) this serialization
/// will be used to read paths 'a.b' and 'b.c' as subcolumns.
/// Only reading is supported: the serialize* methods throw NOT_IMPLEMENTED.
class SerializationObjectTypedPath final : public SerializationWrapper
{
public:
/// `nested_` is the serialization the reading is delegated to, `path_` is the typed path to read.
SerializationObjectTypedPath(const SerializationPtr & nested_, const String & path_)
: SerializationWrapper(nested_)
, path(path_)
{
}
void enumerateStreams(
EnumerateStreamsSettings & settings,
const StreamCallback & callback,
const SubstreamData & data) const override;
void serializeBinaryBulkStatePrefix(
const IColumn & column,
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void serializeBinaryBulkStateSuffix(
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state,
SubstreamsDeserializeStatesCache * cache) const override;
void serializeBinaryBulkWithMultipleStreams(
const IColumn & column,
size_t offset,
size_t limit,
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void deserializeBinaryBulkWithMultipleStreams(
ColumnPtr & column,
size_t limit,
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state,
SubstreamsCache * cache) const override;
private:
/// Typed path inside the object; used as the name of the ObjectTypedPath substream.
String path;
};
}

View File

@ -0,0 +1,259 @@
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/DataTypeVariant.h>
#include <DataTypes/Serializations/SerializationObject.h>
#include <DataTypes/Serializations/SerializationSubObject.h>
#include <Common/logger_useful.h>
namespace DB
{
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
}
/// Stores the requested path prefix and the serializations of the typed paths, and prepares
/// the serializations used for dynamic paths (SerializationDynamic) and for shared data
/// (default serialization of the shared data type).
SerializationSubObject::SerializationSubObject(
const String & path_prefix_, const std::unordered_map<String, SerializationPtr> & typed_paths_serializations_)
: path_prefix(path_prefix_)
, typed_paths_serializations(typed_paths_serializations_)
, dynamic_serialization(std::make_shared<SerializationDynamic>())
, shared_data_serialization(SerializationObject::getTypeOfSharedData()->getDefaultSerialization())
{
}
/// Deserialization state of SerializationSubObject.
struct DeserializeBinaryBulkStateSubObject : public ISerialization::DeserializeBinaryBulkState
{
/// States of typed paths, keyed by the full path.
std::unordered_map<String, ISerialization::DeserializeBinaryBulkStatePtr> typed_path_states;
/// States of dynamic paths with the requested prefix, keyed by the full path.
std::unordered_map<String, ISerialization::DeserializeBinaryBulkStatePtr> dynamic_path_states;
/// Full dynamic paths with the requested prefix.
std::vector<String> dynamic_paths;
/// The same dynamic paths with the prefix removed; filled in the same order as `dynamic_paths`.
std::vector<String> dynamic_sub_paths;
/// State of the shared data stream.
ISerialization::DeserializeBinaryBulkStatePtr shared_data_state;
ColumnPtr shared_data;
};
/// Enumerates the substreams needed to read a sub-object:
/// the object structure stream, the streams of all typed paths with the requested prefix,
/// the shared data streams and, when a deserialization state is available,
/// the streams of the dynamic paths remembered in that state.
void SerializationSubObject::enumerateStreams(
    DB::ISerialization::EnumerateStreamsSettings & settings,
    const DB::ISerialization::StreamCallback & callback,
    const DB::ISerialization::SubstreamData & data) const
{
    auto & stream_path = settings.path;

    stream_path.push_back(Substream::ObjectStructure);
    callback(stream_path);
    stream_path.pop_back();

    const auto * object_column = data.column ? &assert_cast<const ColumnObject &>(*data.column) : nullptr;
    const auto * object_type = data.type ? &assert_cast<const DataTypeObject &>(*data.type) : nullptr;
    const auto * sub_object_state = data.deserialize_state
        ? checkAndGetState<DeserializeBinaryBulkStateSubObject>(data.deserialize_state)
        : nullptr;

    stream_path.push_back(Substream::ObjectData);

    /// typed_paths_serializations contains only typed paths with requested prefix from original Object column.
    /// The provided type and column are looked up by the path without the prefix and the separator after it.
    const size_t sub_path_offset = path_prefix.size() + 1;
    for (const auto & [typed_path, typed_serialization] : typed_paths_serializations)
    {
        stream_path.push_back(Substream::ObjectTypedPath);
        stream_path.back().object_path_name = typed_path;
        auto typed_data = SubstreamData(typed_serialization)
                              .withType(object_type ? object_type->getTypedPaths().at(typed_path.substr(sub_path_offset)) : nullptr)
                              .withColumn(object_column ? object_column->getTypedPaths().at(typed_path.substr(sub_path_offset)) : nullptr)
                              .withSerializationInfo(data.serialization_info)
                              .withDeserializeState(sub_object_state ? sub_object_state->typed_path_states.at(typed_path) : nullptr);
        stream_path.back().data = typed_data;
        typed_serialization->enumerateStreams(settings, callback, typed_data);
        stream_path.pop_back();
    }

    /// We will need to read shared data to find all paths with requested prefix.
    stream_path.push_back(Substream::ObjectSharedData);
    auto shared_substream = SubstreamData(shared_data_serialization)
                                .withType(data.type ? SerializationObject::getTypeOfSharedData() : nullptr)
                                .withColumn(data.column ? SerializationObject::getTypeOfSharedData()->createColumn() : nullptr)
                                .withSerializationInfo(data.serialization_info)
                                .withDeserializeState(sub_object_state ? sub_object_state->shared_data_state : nullptr);
    stream_path.back().data = shared_substream;
    shared_data_serialization->enumerateStreams(settings, callback, shared_substream);
    stream_path.pop_back();

    /// If deserialize state is provided, enumerate streams for dynamic paths.
    if (sub_object_state)
    {
        DataTypePtr dynamic_type = std::make_shared<DataTypeDynamic>();
        for (const auto & [dynamic_path, dynamic_state] : sub_object_state->dynamic_path_states)
        {
            stream_path.push_back(Substream::ObjectDynamicPath);
            stream_path.back().object_path_name = dynamic_path;
            auto dynamic_data = SubstreamData(dynamic_serialization)
                                    .withType(object_type ? dynamic_type : nullptr)
                                    .withColumn(nullptr)
                                    .withSerializationInfo(data.serialization_info)
                                    .withDeserializeState(dynamic_state);
            stream_path.back().data = dynamic_data;
            dynamic_serialization->enumerateStreams(settings, callback, dynamic_data);
            stream_path.pop_back();
        }
    }

    stream_path.pop_back();
}
/// Writing is not supported by this serialization: the method unconditionally throws NOT_IMPLEMENTED.
void SerializationSubObject::serializeBinaryBulkStatePrefix(const IColumn &, SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const
{
throw Exception(
ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinaryBulkStatePrefix is not implemented for SerializationSubObject");
}
/// Writing is not supported by this serialization: the method unconditionally throws NOT_IMPLEMENTED.
void SerializationSubObject::serializeBinaryBulkStateSuffix(SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const
{
throw Exception(
ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinaryBulkStateSuffix is not implemented for SerializationSubObject");
}
namespace
{

/// Return sub-path by specified prefix, i.e. the part of `path` that follows
/// the prefix and the one-character separator after it.
/// For example, for prefix a.b:
/// a.b.c.d -> c.d, a.b.c -> c
String getSubPath(const String & path, const String & prefix)
{
    const size_t sub_path_begin = prefix.size() + 1;
    return path.substr(sub_path_begin);
}

/// Same as above, but returns a view into `path` without copying.
std::string_view getSubPath(const std::string_view & path, const String & prefix)
{
    const size_t sub_path_begin = prefix.size() + 1;
    return path.substr(sub_path_begin);
}

}
/// Prepares the deserialization state for reading a sub-object.
/// The object structure state is read first; if it is not available, `state` is left untouched.
/// Then the state prefixes are read for every typed path, for every dynamic path nested inside
/// the requested prefix and for the shared data.
void SerializationSubObject::deserializeBinaryBulkStatePrefix(
    DeserializeBinaryBulkSettings & settings, DeserializeBinaryBulkStatePtr & state, SubstreamsDeserializeStatesCache * cache) const
{
    auto structure_state = SerializationObject::deserializeObjectStructureStatePrefix(settings, cache);
    if (!structure_state)
        return;

    auto sub_object_state = std::make_shared<DeserializeBinaryBulkStateSubObject>();
    settings.path.push_back(Substream::ObjectData);

    for (const auto & [path, serialization] : typed_paths_serializations)
    {
        settings.path.push_back(Substream::ObjectTypedPath);
        settings.path.back().object_path_name = path;
        serialization->deserializeBinaryBulkStatePrefix(settings, sub_object_state->typed_path_states[path], cache);
        settings.path.pop_back();
    }

    const auto * object_structure = checkAndGetState<SerializationObject::DeserializeBinaryBulkStateObjectStructure>(structure_state);
    for (const auto & dynamic_path : object_structure->sorted_dynamic_paths)
    {
        /// Save only dynamic paths nested inside the requested prefix, i.e. paths of the form "<prefix>.<sub-path>".
        /// Checking only starts_with() is not enough: for prefix "a.b" it would also accept the sibling
        /// path "a.bc", and getSubPath() would then produce a wrong sub-path for it.
        const bool is_nested_in_prefix = dynamic_path.size() > path_prefix.size()
            && dynamic_path[path_prefix.size()] == '.'
            && dynamic_path.starts_with(path_prefix);
        if (!is_nested_in_prefix)
            continue;

        settings.path.push_back(Substream::ObjectDynamicPath);
        settings.path.back().object_path_name = dynamic_path;
        dynamic_serialization->deserializeBinaryBulkStatePrefix(settings, sub_object_state->dynamic_path_states[dynamic_path], cache);
        settings.path.pop_back();

        /// Keep the full path and the path relative to the prefix at the same position of the two lists.
        sub_object_state->dynamic_paths.push_back(dynamic_path);
        sub_object_state->dynamic_sub_paths.push_back(getSubPath(dynamic_path, path_prefix));
    }

    settings.path.push_back(Substream::ObjectSharedData);
    shared_data_serialization->deserializeBinaryBulkStatePrefix(settings, sub_object_state->shared_data_state, cache);
    settings.path.pop_back();

    settings.path.pop_back();
    state = std::move(sub_object_state);
}
/// Writing is not supported by this serialization: the method unconditionally throws NOT_IMPLEMENTED.
void SerializationSubObject::serializeBinaryBulkWithMultipleStreams(
    const IColumn & /*column*/,
    size_t /*offset*/,
    size_t /*limit*/,
    SerializeBinaryBulkSettings & /*settings*/,
    SerializeBinaryBulkStatePtr & /*state*/) const
{
    throw Exception(
        ErrorCodes::NOT_IMPLEMENTED,
        "Method serializeBinaryBulkWithMultipleStreams is not implemented for SerializationSubObject");
}
/// Reads up to `limit` rows of the sub-object into `result_column` (which must be a ColumnObject).
/// Does nothing if `state` is empty (it stays empty when deserializeBinaryBulkStatePrefix found no structure state).
/// Typed and dynamic paths are read directly into the columns of the result, keyed by the path
/// relative to path_prefix. Shared data of the whole object is read into a column kept in the state
/// and then only the paths nested under path_prefix are copied (with the prefix removed)
/// into the shared data of the result.
void SerializationSubObject::deserializeBinaryBulkWithMultipleStreams(
    ColumnPtr & result_column,
    size_t limit,
    DeserializeBinaryBulkSettings & settings,
    DeserializeBinaryBulkStatePtr & state,
    SubstreamsCache * cache) const
{
    if (!state)
        return;

    auto * sub_object_state = checkAndGetState<DeserializeBinaryBulkStateSubObject>(state);
    auto mutable_column = result_column->assumeMutable();
    auto & column_object = assert_cast<ColumnObject &>(*mutable_column);
    /// If it's a new object column, set its dynamic paths (relative to the prefix).
    if (column_object.empty())
        column_object.setDynamicPaths(sub_object_state->dynamic_sub_paths);

    auto & typed_paths = column_object.getTypedPaths();
    auto & dynamic_paths = column_object.getDynamicPaths();

    settings.path.push_back(Substream::ObjectData);
    for (const auto & [path, serialization] : typed_paths_serializations)
    {
        settings.path.push_back(Substream::ObjectTypedPath);
        settings.path.back().object_path_name = path;
        serialization->deserializeBinaryBulkWithMultipleStreams(typed_paths[getSubPath(path, path_prefix)], limit, settings, sub_object_state->typed_path_states[path], cache);
        settings.path.pop_back();
    }

    for (const auto & path : sub_object_state->dynamic_paths)
    {
        settings.path.push_back(Substream::ObjectDynamicPath);
        settings.path.back().object_path_name = path;
        dynamic_serialization->deserializeBinaryBulkWithMultipleStreams(dynamic_paths[getSubPath(path, path_prefix)], limit, settings, sub_object_state->dynamic_path_states[path], cache);
        settings.path.pop_back();
    }

    settings.path.push_back(Substream::ObjectSharedData);
    /// If it's a new object column, reinitialize column for shared data.
    if (result_column->empty())
        sub_object_state->shared_data = SerializationObject::getTypeOfSharedData()->createColumn();
    size_t prev_size = column_object.size();
    shared_data_serialization->deserializeBinaryBulkWithMultipleStreams(sub_object_state->shared_data, limit, settings, sub_object_state->shared_data_state, cache);
    settings.path.pop_back();

    auto & sub_object_shared_data = column_object.getSharedDataColumn();
    const auto & shared_data_array = assert_cast<const ColumnArray &>(*sub_object_state->shared_data);
    const auto & shared_data_offsets = shared_data_array.getOffsets();
    /// Check if there is no data in shared data in current range.
    if (shared_data_offsets.back() == shared_data_offsets[ssize_t(prev_size) - 1])
    {
        /// NOTE(review): this inserts `limit` rows while the branch below inserts
        /// shared_data_offsets.size() - prev_size rows; confirm they cannot differ (e.g. when fewer rows were read).
        sub_object_shared_data.insertManyDefaults(limit);
    }
    else
    {
        const auto & shared_data_tuple = assert_cast<const ColumnTuple &>(shared_data_array.getData());
        const auto & shared_data_paths = assert_cast<const ColumnString &>(shared_data_tuple.getColumn(0));
        const auto & shared_data_values = assert_cast<const ColumnString &>(shared_data_tuple.getColumn(1));
        auto & sub_object_data_offsets = column_object.getSharedDataOffsets();
        auto [sub_object_shared_data_paths, sub_object_shared_data_values] = column_object.getSharedDataPathsAndValues();
        StringRef prefix_ref(path_prefix);
        for (size_t i = prev_size; i != shared_data_offsets.size(); ++i)
        {
            size_t start = shared_data_offsets[ssize_t(i) - 1];
            size_t end = shared_data_offsets[ssize_t(i)];
            size_t lower_bound_index = ColumnObject::findPathLowerBoundInSharedData(prefix_ref, shared_data_paths, start, end);
            for (; lower_bound_index != end; ++lower_bound_index)
            {
                auto path = shared_data_paths.getDataAt(lower_bound_index).toView();
                if (!path.starts_with(path_prefix))
                    break;

                /// Take only paths nested under the prefix: the prefix must be followed by the '.' separator.
                /// This skips both the path equal to the prefix and sibling paths that merely share
                /// the same leading characters (for prefix a.b: a.bc or a.b-x). Such siblings can be
                /// located between nested paths, so they are skipped without stopping the scan.
                if (path.size() > path_prefix.size() && path[path_prefix.size()] == '.')
                {
                    auto sub_path = getSubPath(path, path_prefix);
                    sub_object_shared_data_paths->insertData(sub_path.data(), sub_path.size());
                    sub_object_shared_data_values->insertFrom(shared_data_values, lower_bound_index);
                }
            }
            sub_object_data_offsets.push_back(sub_object_shared_data_paths->size());
        }
    }

    /// Pop Substream::ObjectData pushed above.
    settings.path.pop_back();
}
}

View File

@ -0,0 +1,76 @@
#pragma once
#include <DataTypes/Serializations/ISerialization.h>
#include <DataTypes/Serializations/SimpleTextSerialization.h>
namespace DB
{
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
}
/// Serialization of sub-object subcolumns of the Object type.
/// For example, if we have type JSON and data {"a" : {"b" : {"c" : 42, "d" : "Hello"}}, "c" : [1, 2, 3], "d" : 42}
/// this class will be responsible for reading sub-object a.b and will read JSON column with data {"c" : 42, "d" : "Hello"}.
class SerializationSubObject final : public SimpleTextSerialization
{
public:
    /// path_prefix_ - path of the requested sub-object, without a trailing separator.
    /// typed_paths_serializations_ - serializations of typed paths, keyed by the full path.
    SerializationSubObject(const String & path_prefix_, const std::unordered_map<String, SerializationPtr> & typed_paths_serializations_);

    void enumerateStreams(
        EnumerateStreamsSettings & settings,
        const StreamCallback & callback,
        const SubstreamData & data) const override;

    /// Bulk serialization is declared only to satisfy the interface.
    void serializeBinaryBulkStatePrefix(
        const IColumn & column,
        SerializeBinaryBulkSettings & settings,
        SerializeBinaryBulkStatePtr & state) const override;

    void serializeBinaryBulkStateSuffix(
        SerializeBinaryBulkSettings & settings,
        SerializeBinaryBulkStatePtr & state) const override;

    void deserializeBinaryBulkStatePrefix(
        DeserializeBinaryBulkSettings & settings,
        DeserializeBinaryBulkStatePtr & state,
        SubstreamsDeserializeStatesCache * cache) const override;

    void serializeBinaryBulkWithMultipleStreams(
        const IColumn & column,
        size_t offset,
        size_t limit,
        SerializeBinaryBulkSettings & settings,
        SerializeBinaryBulkStatePtr & state) const override;

    void deserializeBinaryBulkWithMultipleStreams(
        ColumnPtr & column,
        size_t limit,
        DeserializeBinaryBulkSettings & settings,
        DeserializeBinaryBulkStatePtr & state,
        SubstreamsCache * cache) const override;

    /// Per-value binary and text (de)serialization is not available for this subcolumn:
    /// every method below throws NOT_IMPLEMENTED.
    void serializeBinary(const Field &, WriteBuffer &, const FormatSettings &) const override { throwNoSerialization(); }
    void deserializeBinary(Field &, ReadBuffer &, const FormatSettings &) const override { throwNoSerialization(); }
    void serializeBinary(const IColumn &, size_t, WriteBuffer &, const FormatSettings &) const override { throwNoSerialization(); }
    void deserializeBinary(IColumn &, ReadBuffer &, const FormatSettings &) const override { throwNoSerialization(); }
    void serializeText(const IColumn &, size_t, WriteBuffer &, const FormatSettings &) const override { throwNoSerialization(); }
    void deserializeText(IColumn &, ReadBuffer &, const FormatSettings &, bool) const override { throwNoSerialization(); }
    bool tryDeserializeText(IColumn &, ReadBuffer &, const FormatSettings &, bool) const override { throwNoSerialization(); }

private:
    /// Common failure path of all per-value (de)serialization methods.
    [[noreturn]] static void throwNoSerialization()
    {
        throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Text/binary serialization is not implemented for object sub-object subcolumn");
    }

    /// Path of the sub-object that is read by this serialization.
    String path_prefix;
    /// Serializations of typed paths, keyed by the full path.
    std::unordered_map<String, SerializationPtr> typed_paths_serializations;
    /// Serialization used for dynamic paths.
    SerializationPtr dynamic_serialization;
    /// Serialization used for the shared data.
    SerializationPtr shared_data_serialization;
};
}

View File

@ -218,7 +218,8 @@ void SerializationVariant::serializeBinaryBulkWithMultipleStreamsAndUpdateVarian
size_t limit, size_t limit,
SerializeBinaryBulkSettings & settings, SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state, SerializeBinaryBulkStatePtr & state,
std::unordered_map<String, size_t> & variants_statistics) const std::unordered_map<String, size_t> & variants_statistics,
size_t & total_size_of_variants) const
{ {
const ColumnVariant & col = assert_cast<const ColumnVariant &>(column); const ColumnVariant & col = assert_cast<const ColumnVariant &>(column);
if (const size_t size = col.size(); limit == 0 || offset + limit > size) if (const size_t size = col.size(); limit == 0 || offset + limit > size)
@ -265,6 +266,7 @@ void SerializationVariant::serializeBinaryBulkWithMultipleStreamsAndUpdateVarian
/// We can use the same offset/limit as for whole Variant column /// We can use the same offset/limit as for whole Variant column
variants[non_empty_global_discr]->serializeBinaryBulkWithMultipleStreams(col.getVariantByGlobalDiscriminator(non_empty_global_discr), offset, limit, settings, variant_state->variant_states[non_empty_global_discr]); variants[non_empty_global_discr]->serializeBinaryBulkWithMultipleStreams(col.getVariantByGlobalDiscriminator(non_empty_global_discr), offset, limit, settings, variant_state->variant_states[non_empty_global_discr]);
variants_statistics[variant_names[non_empty_global_discr]] += limit; variants_statistics[variant_names[non_empty_global_discr]] += limit;
total_size_of_variants += limit;
settings.path.pop_back(); settings.path.pop_back();
settings.path.pop_back(); settings.path.pop_back();
return; return;
@ -315,7 +317,9 @@ void SerializationVariant::serializeBinaryBulkWithMultipleStreamsAndUpdateVarian
{ {
addVariantElementToPath(settings.path, i); addVariantElementToPath(settings.path, i);
variants[i]->serializeBinaryBulkWithMultipleStreams(col.getVariantByGlobalDiscriminator(i), 0, 0, settings, variant_state->variant_states[i]); variants[i]->serializeBinaryBulkWithMultipleStreams(col.getVariantByGlobalDiscriminator(i), 0, 0, settings, variant_state->variant_states[i]);
variants_statistics[variant_names[i]] += col.getVariantByGlobalDiscriminator(i).size(); size_t variant_size = col.getVariantByGlobalDiscriminator(i).size();
variants_statistics[variant_names[i]] += variant_size;
total_size_of_variants += variant_size;
settings.path.pop_back(); settings.path.pop_back();
} }
settings.path.pop_back(); settings.path.pop_back();
@ -386,6 +390,7 @@ void SerializationVariant::serializeBinaryBulkWithMultipleStreamsAndUpdateVarian
settings, settings,
variant_state->variant_states[i]); variant_state->variant_states[i]);
variants_statistics[variant_names[i]] += variant_offsets_and_limits[i].second; variants_statistics[variant_names[i]] += variant_offsets_and_limits[i].second;
total_size_of_variants += variant_offsets_and_limits[i].second;
settings.path.pop_back(); settings.path.pop_back();
} }
} }
@ -400,7 +405,8 @@ void SerializationVariant::serializeBinaryBulkWithMultipleStreams(
DB::ISerialization::SerializeBinaryBulkStatePtr & state) const DB::ISerialization::SerializeBinaryBulkStatePtr & state) const
{ {
std::unordered_map<String, size_t> tmp_statistics; std::unordered_map<String, size_t> tmp_statistics;
serializeBinaryBulkWithMultipleStreamsAndUpdateVariantStatistics(column, offset, limit, settings, state, tmp_statistics); size_t tmp_size;
serializeBinaryBulkWithMultipleStreamsAndUpdateVariantStatistics(column, offset, limit, settings, state, tmp_statistics, tmp_size);
} }
void SerializationVariant::deserializeBinaryBulkWithMultipleStreams( void SerializationVariant::deserializeBinaryBulkWithMultipleStreams(
@ -1068,6 +1074,16 @@ void SerializationVariant::serializeTextJSON(const IColumn & column, size_t row_
variants[global_discr]->serializeTextJSON(col.getVariantByGlobalDiscriminator(global_discr), col.offsetAt(row_num), ostr, settings); variants[global_discr]->serializeTextJSON(col.getVariantByGlobalDiscriminator(global_discr), col.offsetAt(row_num), ostr, settings);
} }
void SerializationVariant::serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent) const
{
const ColumnVariant & col = assert_cast<const ColumnVariant &>(column);
auto global_discr = col.globalDiscriminatorAt(row_num);
if (global_discr == ColumnVariant::NULL_DISCRIMINATOR)
SerializationNullable::serializeNullJSON(ostr);
else
variants[global_discr]->serializeTextJSONPretty(col.getVariantByGlobalDiscriminator(global_discr), col.offsetAt(row_num), ostr, settings, indent);
}
bool SerializationVariant::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const bool SerializationVariant::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{ {
String field; String field;

View File

@ -113,7 +113,8 @@ public:
size_t limit, size_t limit,
SerializeBinaryBulkSettings & settings, SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state, SerializeBinaryBulkStatePtr & state,
std::unordered_map<String, size_t> & variants_statistics) const; std::unordered_map<String, size_t> & variants_statistics,
size_t & total_size_of_variants) const;
void deserializeBinaryBulkWithMultipleStreams( void deserializeBinaryBulkWithMultipleStreams(
ColumnPtr & column, ColumnPtr & column,
@ -145,6 +146,7 @@ public:
bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
void serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent) const override;
void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;

View File

@ -193,16 +193,6 @@ void SerializationVariantElement::deserializeBinaryBulkWithMultipleStreams(
nested_serialization->deserializeBinaryBulkWithMultipleStreams(variant_element_state->variant, *variant_limit, settings, variant_element_state->variant_element_state, cache); nested_serialization->deserializeBinaryBulkWithMultipleStreams(variant_element_state->variant, *variant_limit, settings, variant_element_state->variant_element_state, cache);
removeVariantFromPath(settings.path); removeVariantFromPath(settings.path);
/// If nothing was deserialized when variant_limit > 0
/// it means that we don't have a stream for such sub-column.
/// It may happen during ALTER MODIFY column with Variant extension.
/// In this case we should just insert default values.
if (variant_element_state->variant->empty())
{
mutable_column->insertManyDefaults(num_new_discriminators);
return;
}
/// If there was nothing to deserialize or nothing was actually deserialized when variant_limit > 0, just insert defaults. /// If there was nothing to deserialize or nothing was actually deserialized when variant_limit > 0, just insert defaults.
/// The second case means that we don't have a stream for such sub-column. It may happen during ALTER MODIFY column with Variant extension. /// The second case means that we don't have a stream for such sub-column. It may happen during ALTER MODIFY column with Variant extension.
if (variant_limit == 0 || variant_element_state->variant->empty()) if (variant_limit == 0 || variant_element_state->variant->empty())

View File

@ -0,0 +1,80 @@
#include <Columns/ColumnObjectDeprecated.h>
#include <Columns/ColumnString.h>
#include <DataTypes/DataTypeObjectDeprecated.h>
#include <DataTypes/Serializations/SerializationObjectDeprecated.h>
#include <DataTypes/Serializations/SerializationString.h>
#include <IO/ReadBufferFromString.h>
#include <IO/WriteBufferFromString.h>
#include <IO/WriteHelpers.h>
#include <Common/FieldVisitorToString.h>
#include <gtest/gtest.h>
#if USE_SIMDJSON
using namespace DB;
/// Writes two JSON documents as a String column and reads the resulting bytes back
/// through the default serialization of the deprecated Object('json') type.
TEST(SerializationObjectDeprecated, FromString)
{
    auto json_strings = ColumnString::create();
    json_strings->insert(R"({"k1" : 1, "k2" : [{"k3" : "aa", "k4" : 2}, {"k3": "bb", "k4": 3}]})");
    json_strings->insert(R"({"k1" : 2, "k2" : [{"k3" : "cc", "k5" : 4}, {"k4": 5}, {"k4": 6}]})");

    WriteBufferFromOwnString serialized;

    /// Write side: a leading byte followed by the bulk-serialized strings.
    {
        auto string_serialization = std::make_shared<SerializationString>();

        ISerialization::SerializeBinaryBulkSettings write_settings;
        ISerialization::SerializeBinaryBulkStatePtr write_state;
        write_settings.position_independent_encoding = false;
        write_settings.getter = [&serialized](const auto &) { return &serialized; };

        /// NOTE(review): presumably the marker telling the Object deserializer that the payload
        /// is stored as strings - confirm against SerializationObjectDeprecated.
        writeIntBinary(static_cast<UInt8>(1), serialized);

        string_serialization->serializeBinaryBulkStatePrefix(*json_strings, write_settings, write_state);
        string_serialization->serializeBinaryBulkWithMultipleStreams(*json_strings, 0, json_strings->size(), write_settings, write_state);
        string_serialization->serializeBinaryBulkStateSuffix(write_settings, write_state);
    }

    auto object_type = std::make_shared<DataTypeObjectDeprecated>("json", false);
    ColumnPtr parsed_column = object_type->createColumn();
    ReadBufferFromOwnString input(serialized.str());

    /// Read side: deserialize the same bytes as an object column.
    {
        auto object_serialization = object_type->getDefaultSerialization();

        ISerialization::DeserializeBinaryBulkSettings read_settings;
        ISerialization::DeserializeBinaryBulkStatePtr read_state;
        read_settings.position_independent_encoding = false;
        read_settings.getter = [&input](const auto &) { return &input; };

        object_serialization->deserializeBinaryBulkStatePrefix(read_settings, read_state, nullptr);
        object_serialization->deserializeBinaryBulkWithMultipleStreams(parsed_column, json_strings->size(), read_settings, read_state, nullptr);
    }

    auto & object_column = assert_cast<ColumnObjectDeprecated &>(*parsed_column->assumeMutable());
    object_column.finalize();

    ASSERT_TRUE(object_column.size() == 2);
    ASSERT_TRUE(object_column.getSubcolumns().size() == 4);

    /// Compares the least common type name and the first expected.size() values of a subcolumn
    /// (values are compared by their string representation).
    auto expect_subcolumn = [&](const auto & path, const auto & expected_type_name, const std::vector<Field> & expected_values)
    {
        const auto & subcolumn = object_column.getSubcolumn(PathInData{path});
        ASSERT_EQ(subcolumn.getLeastCommonType()->getName(), expected_type_name);

        const auto & finalized = subcolumn.getFinalizedColumn();
        for (size_t row = 0; row < expected_values.size(); ++row)
        {
            ASSERT_EQ(
                applyVisitor(FieldVisitorToString(), finalized[row]),
                applyVisitor(FieldVisitorToString(), expected_values[row]));
        }
    };

    expect_subcolumn("k1", "Int8", {1, 2});
    expect_subcolumn("k2.k3", "Array(String)", {Array{"aa", "bb"}, Array{"cc", "", ""}});
    expect_subcolumn("k2.k4", "Array(Int8)", {Array{2, 3}, Array{0, 5, 6}});
    expect_subcolumn("k2.k5", "Array(Int8)", {Array{0, 0}, Array{4, 0, 0}});
}
#endif

View File

@ -1,80 +1,98 @@
#include <DataTypes/Serializations/SerializationString.h>
#include <IO/ReadBufferFromString.h>
#include <IO/WriteBufferFromString.h>
#include <IO/WriteHelpers.h>
#include <DataTypes/DataTypeObject.h>
#include <DataTypes/Serializations/SerializationObject.h>
#include <Columns/ColumnObject.h> #include <Columns/ColumnObject.h>
#include <Columns/ColumnString.h> #include <DataTypes/DataTypeFactory.h>
#include <Common/FieldVisitorToString.h> #include <DataTypes/Serializations/SerializationObject.h>
#include <IO/ReadBufferFromString.h>
#include <gtest/gtest.h> #include <gtest/gtest.h>
#if USE_SIMDJSON
using namespace DB; using namespace DB;
TEST(SerializationObject, FromString) TEST(ObjectSerialization, FieldBinarySerialization)
{ {
WriteBufferFromOwnString out; auto type = DataTypeFactory::instance().get("JSON(max_dynamic_types=10, max_dynamic_paths=2, a.b UInt32, a.c Array(String))");
auto serialization = type->getDefaultSerialization();
auto column_string = ColumnString::create(); Object object1 = Object{{"a.c", Array{"Str1", "Str2"}}, {"a.d", Field(42)}, {"a.e", Tuple{Field(43), "Str3"}}};
column_string->insert(R"({"k1" : 1, "k2" : [{"k3" : "aa", "k4" : 2}, {"k3": "bb", "k4": 3}]})"); WriteBufferFromOwnString ostr;
column_string->insert(R"({"k1" : 2, "k2" : [{"k3" : "cc", "k5" : 4}, {"k4": 5}, {"k4": 6}]})"); serialization->serializeBinary(object1, ostr, FormatSettings());
ReadBufferFromString istr(ostr.str());
{ Field object2;
auto serialization = std::make_shared<SerializationString>(); serialization->deserializeBinary(object2, istr, FormatSettings());
ASSERT_EQ(object1, object2.safeGet<Object>());
ISerialization::SerializeBinaryBulkSettings settings;
ISerialization::SerializeBinaryBulkStatePtr state;
settings.position_independent_encoding = false;
settings.getter = [&out](const auto &) { return &out; };
writeIntBinary(static_cast<UInt8>(1), out);
serialization->serializeBinaryBulkStatePrefix(*column_string, settings, state);
serialization->serializeBinaryBulkWithMultipleStreams(*column_string, 0, column_string->size(), settings, state);
serialization->serializeBinaryBulkStateSuffix(settings, state);
}
auto type_object = std::make_shared<DataTypeObject>("json", false);
ColumnPtr result_column = type_object->createColumn();
ReadBufferFromOwnString in(out.str());
{
auto serialization = type_object->getDefaultSerialization();
ISerialization::DeserializeBinaryBulkSettings settings;
ISerialization::DeserializeBinaryBulkStatePtr state;
settings.position_independent_encoding = false;
settings.getter = [&in](const auto &) { return &in; };
serialization->deserializeBinaryBulkStatePrefix(settings, state, nullptr);
serialization->deserializeBinaryBulkWithMultipleStreams(result_column, column_string->size(), settings, state, nullptr);
}
auto & column_object = assert_cast<ColumnObject &>(*result_column->assumeMutable());
column_object.finalize();
ASSERT_TRUE(column_object.size() == 2);
ASSERT_TRUE(column_object.getSubcolumns().size() == 4);
auto check_subcolumn = [&](const auto & name, const auto & type_name, const std::vector<Field> & expected)
{
const auto & subcolumn = column_object.getSubcolumn(PathInData{name});
ASSERT_EQ(subcolumn.getLeastCommonType()->getName(), type_name);
const auto & data = subcolumn.getFinalizedColumn();
for (size_t i = 0; i < expected.size(); ++i)
ASSERT_EQ(
applyVisitor(FieldVisitorToString(), data[i]),
applyVisitor(FieldVisitorToString(), expected[i]));
};
check_subcolumn("k1", "Int8", {1, 2});
check_subcolumn("k2.k3", "Array(String)", {Array{"aa", "bb"}, Array{"cc", "", ""}});
check_subcolumn("k2.k4", "Array(Int8)", {Array{2, 3}, Array{0, 5, 6}});
check_subcolumn("k2.k5", "Array(Int8)", {Array{0, 0}, Array{4, 0, 0}});
} }
#endif
/// Round trip of single rows through the per-value binary serialization of a JSON column:
/// a row is serialized, deserialized back into the same column (which appends a new row),
/// and the original row is compared with the appended one.
TEST(ObjectSerialization, ColumnBinarySerialization)
{
    auto json_type = DataTypeFactory::instance().get("JSON(max_dynamic_types=10, max_dynamic_paths=2, a.b UInt32, a.c Array(String))");
    auto json_serialization = json_type->getDefaultSerialization();
    auto column = json_type->createColumn();
    auto & object_column = assert_cast<ColumnObject &>(*column);

    /// Serializes the given row and reads it back into the same column.
    auto append_round_tripped_row = [&](size_t row)
    {
        WriteBufferFromOwnString out;
        json_serialization->serializeBinary(object_column, row, out, FormatSettings());
        ReadBufferFromString in(out.str());
        json_serialization->deserializeBinary(object_column, in, FormatSettings());
    };

    object_column.insert(Object{{"a.c", Array{"Str1", "Str2"}}, {"a.d", Field(42)}, {"a.e", Tuple{Field(43), "Str3"}}});
    append_round_tripped_row(0);
    ASSERT_EQ(object_column[0], object_column[1]);

    /// This object has more paths than the first one (the type is declared with max_dynamic_paths=2).
    object_column.insert(Object{{"a.c", Array{"Str1", "Str2"}}, {"a.e", Field(42)}, {"b.d", Field(42)}, {"b.e", Tuple{Field(43), "Str3"}}, {"b.g", Field("Str4")}});
    append_round_tripped_row(2);
    ASSERT_EQ(object_column[2], object_column[3]);
}
/// Checks the compact and the pretty JSON text output of a single row of a JSON column
/// against exact expected strings.
TEST(ObjectSerialization, JSONSerialization)
{
/// The type declares two typed paths: a.b (UInt32) and a.c (Array(String)).
auto type = DataTypeFactory::instance().get("JSON(max_dynamic_types=10, max_dynamic_paths=2, a.b UInt32, a.c Array(String))");
auto serialization = type->getDefaultSerialization();
auto col = type->createColumn();
auto & col_object = assert_cast<ColumnObject &>(*col);
col_object.insert(Object{{"a.c", Array{"Str1", "Str2"}}, {"a.d", Field(42)}, {"a.e", Tuple{Field(43), "Str3"}}});
/// The second row (the one that is serialized below) contains both a value at path "a" and paths nested
/// under "a", and both the typed path "a.b" and paths nested under "a.b".
col_object.insert(Object{{"a.c", Array{"Str1", "Str2"}}, {"a", Tuple{Field(43), "Str3"}}, {"a.b.c", Field(42)}, {"a.b.e", Field(43)}, {"b.c.d.e", Field(42)}, {"b.c.d.g", Field(43)}, {"b.c.h.r", Field(44)}, {"c.g.h.t", Array{Field("Str"), Field("Str2")}}, {"h", Field("Str")}, {"j", Field("Str")}});
/// Compact output of row 1. Note that the expected string contains the keys "a" and "b" twice:
/// once for the value stored at the path itself and once for the object built from the nested paths.
WriteBufferFromOwnString buf1;
serialization->serializeTextJSON(col_object, 1, buf1, FormatSettings());
ASSERT_EQ(buf1.str(), R"({"a":[43,"Str3"],"a":{"b":0,"b":{"c":42,"e":43},"c":["Str1","Str2"]},"b":{"c":{"d":{"e":42,"g":43},"h":{"r":44}}},"c":{"g":{"h":{"t":["Str","Str2"]}}},"h":"Str","j":"Str"})");
/// Pretty output of the same row with initial indent 0; the raw string below is compared byte-for-byte.
WriteBufferFromOwnString buf2;
serialization->serializeTextJSONPretty(col_object, 1, buf2, FormatSettings(), 0);
ASSERT_EQ(buf2.str(), R"({
"a" : [
43,
"Str3"
],
"a" : {
"b" : 0,
"b" : {
"c" : 42,
"e" : 43
},
"c" : [
"Str1",
"Str2"
]
},
"b" : {
"c" : {
"d" : {
"e" : 42,
"g" : 43
},
"h" : {
"r" : 44
}
}
},
"c" : {
"g" : {
"h" : {
"t" : [
"Str",
"Str2"
]
}
}
},
"h" : "Str",
"j" : "Str"
})");
}

View File

@ -216,6 +216,7 @@ bool canBeSafelyCasted(const DataTypePtr & from_type, const DataTypePtr & to_typ
return false; return false;
} }
case TypeIndex::String: case TypeIndex::String:
case TypeIndex::ObjectDeprecated:
case TypeIndex::Object: case TypeIndex::Object:
case TypeIndex::Set: case TypeIndex::Set:
case TypeIndex::Interval: case TypeIndex::Interval:

View File

@ -126,4 +126,7 @@ GTEST_TEST(DataTypesBinaryEncoding, EncodeAndDecode)
check(DataTypeFactory::instance().get("Polygon")); check(DataTypeFactory::instance().get("Polygon"));
check(DataTypeFactory::instance().get("MultiPolygon")); check(DataTypeFactory::instance().get("MultiPolygon"));
check(DataTypeFactory::instance().get("Tuple(Map(LowCardinality(String), Array(AggregateFunction(2, quantiles(0.1, 0.2), Float32))), Array(Array(Tuple(UInt32, Tuple(a Map(String, String), b Nullable(Date), c Variant(Tuple(g String, d Array(UInt32)), Date, Map(String, String)))))))")); check(DataTypeFactory::instance().get("Tuple(Map(LowCardinality(String), Array(AggregateFunction(2, quantiles(0.1, 0.2), Float32))), Array(Array(Tuple(UInt32, Tuple(a Map(String, String), b Nullable(Date), c Variant(Tuple(g String, d Array(UInt32)), Date, Map(String, String)))))))"));
check(DataTypeFactory::instance().get("JSON"));
check(DataTypeFactory::instance().get("JSON(max_dynamic_paths=10)"));
check(DataTypeFactory::instance().get("JSON(max_dynamic_paths=10, max_dynamic_types=10, a.b.c UInt32, SKIP a.c, b.g String, SKIP l.d.f)"));
} }

View File

@ -1153,6 +1153,7 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep
query_context->setSetting("allow_experimental_object_type", 1); query_context->setSetting("allow_experimental_object_type", 1);
query_context->setSetting("allow_experimental_variant_type", 1); query_context->setSetting("allow_experimental_variant_type", 1);
query_context->setSetting("allow_experimental_dynamic_type", 1); query_context->setSetting("allow_experimental_dynamic_type", 1);
query_context->setSetting("allow_experimental_json_type", 1);
query_context->setSetting("allow_experimental_vector_similarity_index", 1); query_context->setSetting("allow_experimental_vector_similarity_index", 1);
query_context->setSetting("allow_experimental_bigint_types", 1); query_context->setSetting("allow_experimental_bigint_types", 1);
query_context->setSetting("allow_experimental_window_functions", 1); query_context->setSetting("allow_experimental_window_functions", 1);

View File

@ -464,7 +464,7 @@ String getAdditionalFormatInfoByEscapingRule(const FormatSettings & settings, Fo
settings.json.read_arrays_as_strings, settings.json.read_arrays_as_strings,
settings.json.try_infer_objects_as_tuples, settings.json.try_infer_objects_as_tuples,
settings.json.infer_incomplete_types_as_strings, settings.json.infer_incomplete_types_as_strings,
settings.json.allow_object_type, settings.json.allow_deprecated_object_type,
settings.json.use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects); settings.json.use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects);
break; break;
default: default:

View File

@ -146,11 +146,13 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se
format_settings.json.validate_types_from_metadata = settings.input_format_json_validate_types_from_metadata; format_settings.json.validate_types_from_metadata = settings.input_format_json_validate_types_from_metadata;
format_settings.json.validate_utf8 = settings.output_format_json_validate_utf8; format_settings.json.validate_utf8 = settings.output_format_json_validate_utf8;
format_settings.json_object_each_row.column_for_object_name = settings.format_json_object_each_row_column_for_object_name; format_settings.json_object_each_row.column_for_object_name = settings.format_json_object_each_row_column_for_object_name;
format_settings.json.allow_object_type = context->getSettingsRef().allow_experimental_object_type; format_settings.json.allow_deprecated_object_type = context->getSettingsRef().allow_experimental_object_type;
format_settings.json.allow_json_type = context->getSettingsRef().allow_experimental_json_type;
format_settings.json.compact_allow_variable_number_of_columns = settings.input_format_json_compact_allow_variable_number_of_columns; format_settings.json.compact_allow_variable_number_of_columns = settings.input_format_json_compact_allow_variable_number_of_columns;
format_settings.json.try_infer_objects_as_tuples = settings.input_format_json_try_infer_named_tuples_from_objects; format_settings.json.try_infer_objects_as_tuples = settings.input_format_json_try_infer_named_tuples_from_objects;
format_settings.json.throw_on_bad_escape_sequence = settings.input_format_json_throw_on_bad_escape_sequence; format_settings.json.throw_on_bad_escape_sequence = settings.input_format_json_throw_on_bad_escape_sequence;
format_settings.json.ignore_unnecessary_fields = settings.input_format_json_ignore_unnecessary_fields; format_settings.json.ignore_unnecessary_fields = settings.input_format_json_ignore_unnecessary_fields;
format_settings.json.type_json_skip_duplicated_paths = settings.type_json_skip_duplicated_paths;
format_settings.null_as_default = settings.input_format_null_as_default; format_settings.null_as_default = settings.input_format_null_as_default;
format_settings.force_null_for_omitted_fields = settings.input_format_force_null_for_omitted_fields; format_settings.force_null_for_omitted_fields = settings.input_format_force_null_for_omitted_fields;
format_settings.decimal_trailing_zeros = settings.output_format_decimal_trailing_zeros; format_settings.decimal_trailing_zeros = settings.output_format_decimal_trailing_zeros;

View File

@ -228,13 +228,15 @@ struct FormatSettings
bool try_infer_numbers_from_strings = false; bool try_infer_numbers_from_strings = false;
bool validate_types_from_metadata = true; bool validate_types_from_metadata = true;
bool validate_utf8 = false; bool validate_utf8 = false;
bool allow_object_type = false; bool allow_deprecated_object_type = false;
bool allow_json_type = false;
bool valid_output_on_exception = false; bool valid_output_on_exception = false;
bool compact_allow_variable_number_of_columns = false; bool compact_allow_variable_number_of_columns = false;
bool try_infer_objects_as_tuples = false; bool try_infer_objects_as_tuples = false;
bool infer_incomplete_types_as_strings = true; bool infer_incomplete_types_as_strings = true;
bool throw_on_bad_escape_sequence = true; bool throw_on_bad_escape_sequence = true;
bool ignore_unnecessary_fields = true; bool ignore_unnecessary_fields = true;
bool type_json_skip_duplicated_paths = false;
} json{}; } json{};
struct struct

View File

@ -8,7 +8,6 @@
#if USE_RAPIDJSON #if USE_RAPIDJSON
#include <Common/JSONParsers/RapidJSONParser.h> #include <Common/JSONParsers/RapidJSONParser.h>
#endif #endif
#include <Common/JSONParsers/DummyJSONParser.h> #include <Common/JSONParsers/DummyJSONParser.h>
#include <Columns/ColumnArray.h> #include <Columns/ColumnArray.h>
@ -22,6 +21,7 @@
#include <Columns/ColumnVariant.h> #include <Columns/ColumnVariant.h>
#include <Columns/ColumnVector.h> #include <Columns/ColumnVector.h>
#include <Columns/ColumnsDateTime.h> #include <Columns/ColumnsDateTime.h>
#include <Columns/ColumnObject.h>
#include <DataTypes/DataTypeArray.h> #include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeDateTime.h> #include <DataTypes/DataTypeDateTime.h>
@ -38,8 +38,10 @@
#include <DataTypes/DataTypeVariant.h> #include <DataTypes/DataTypeVariant.h>
#include <DataTypes/DataTypesDecimal.h> #include <DataTypes/DataTypesDecimal.h>
#include <DataTypes/DataTypesNumber.h> #include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeObject.h>
#include <DataTypes/Serializations/SerializationDecimal.h> #include <DataTypes/Serializations/SerializationDecimal.h>
#include <DataTypes/Serializations/SerializationVariant.h> #include <DataTypes/Serializations/SerializationVariant.h>
#include <DataTypes/Serializations/SerializationObject.h>
#include <IO/ReadBufferFromMemory.h> #include <IO/ReadBufferFromMemory.h>
@ -53,6 +55,7 @@ namespace DB
namespace ErrorCodes namespace ErrorCodes
{ {
extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int INCORRECT_DATA;
} }
template <typename JSONParser> template <typename JSONParser>
@ -123,7 +126,7 @@ void jsonElementToString(const typename JSONParser::Element & element, WriteBuff
template <typename JSONParser, typename NumberType> template <typename JSONParser, typename NumberType>
bool tryGetNumericValueFromJSONElement( bool tryGetNumericValueFromJSONElement(
NumberType & value, const typename JSONParser::Element & element, bool convert_bool_to_integer, String & error) NumberType & value, const typename JSONParser::Element & element, bool convert_bool_to_integer, bool allow_type_conversion, String & error)
{ {
switch (element.type()) switch (element.type())
{ {
@ -135,7 +138,7 @@ bool tryGetNumericValueFromJSONElement(
/// But it will be more convenient for user to perform conversion. /// But it will be more convenient for user to perform conversion.
value = static_cast<NumberType>(element.getDouble()); value = static_cast<NumberType>(element.getDouble());
} }
else if (!accurate::convertNumeric<Float64, NumberType, false>(element.getDouble(), value)) else if (!allow_type_conversion || !accurate::convertNumeric<Float64, NumberType, false>(element.getDouble(), value))
{ {
error = fmt::format("cannot convert double value {} to {}", element.getDouble(), TypeName<NumberType>); error = fmt::format("cannot convert double value {} to {}", element.getDouble(), TypeName<NumberType>);
return false; return false;
@ -158,7 +161,7 @@ bool tryGetNumericValueFromJSONElement(
case ElementType::BOOL: case ElementType::BOOL:
if constexpr (is_integer<NumberType>) if constexpr (is_integer<NumberType>)
{ {
if (convert_bool_to_integer) if (convert_bool_to_integer && allow_type_conversion)
{ {
value = static_cast<NumberType>(element.getBool()); value = static_cast<NumberType>(element.getBool());
break; break;
@ -166,13 +169,17 @@ bool tryGetNumericValueFromJSONElement(
} }
error = fmt::format("cannot convert bool value to {}", TypeName<NumberType>); error = fmt::format("cannot convert bool value to {}", TypeName<NumberType>);
return false; return false;
case ElementType::STRING: { case ElementType::STRING:
{
if (!allow_type_conversion)
return false;
auto rb = ReadBufferFromMemory{element.getString()}; auto rb = ReadBufferFromMemory{element.getString()};
if constexpr (std::is_floating_point_v<NumberType>) if constexpr (std::is_floating_point_v<NumberType>)
{ {
if (!tryReadFloatText(value, rb) || !rb.eof()) if (!tryReadFloatText(value, rb) || !rb.eof())
{ {
error = fmt::format("cannot parse {} value here: {}", TypeName<NumberType>, element.getString()); error = fmt::format("cannot parse {} value here: \"{}\"", TypeName<NumberType>, element.getString());
return false; return false;
} }
} }
@ -186,13 +193,13 @@ bool tryGetNumericValueFromJSONElement(
rb.position() = rb.buffer().begin(); rb.position() = rb.buffer().begin();
if (!tryReadFloatText(tmp_float, rb) || !rb.eof()) if (!tryReadFloatText(tmp_float, rb) || !rb.eof())
{ {
error = fmt::format("cannot parse {} value here: {}", TypeName<NumberType>, element.getString()); error = fmt::format("cannot parse {} value here: \"{}\"", TypeName<NumberType>, element.getString());
return false; return false;
} }
if (!accurate::convertNumeric<Float64, NumberType, false>(tmp_float, value)) if (!accurate::convertNumeric<Float64, NumberType, false>(tmp_float, value))
{ {
error = fmt::format("cannot parse {} value here: {}", TypeName<NumberType>, element.getString()); error = fmt::format("cannot parse {} value here: \"{}\"", TypeName<NumberType>, element.getString());
return false; return false;
} }
} }
@ -241,8 +248,16 @@ public:
return false; return false;
} }
if (is_bool_type && !insert_settings.allow_type_conversion)
{
if (!element.isBool())
return false;
assert_cast<ColumnVector<NumberType> &>(column).insertValue(element.getBool());
return true;
}
NumberType value; NumberType value;
if (!tryGetNumericValueFromJSONElement<JSONParser, NumberType>(value, element, insert_settings.convert_bool_to_integer || is_bool_type, error)) if (!tryGetNumericValueFromJSONElement<JSONParser, NumberType>(value, element, insert_settings.convert_bool_to_integer || is_bool_type, insert_settings.allow_type_conversion, error))
{ {
if (error.empty()) if (error.empty())
error = fmt::format("cannot read {} value from JSON element: {}", TypeName<NumberType>, jsonElementToString<JSONParser>(element, format_settings)); error = fmt::format("cannot read {} value from JSON element: {}", TypeName<NumberType>, jsonElementToString<JSONParser>(element, format_settings));
@ -289,8 +304,17 @@ public:
return false; return false;
} }
if (this->is_bool_type && !insert_settings.allow_type_conversion)
{
if (!element.isBool())
return false;
UInt8 value = element.getBool();
assert_cast<ColumnLowCardinality &>(column).insertData(reinterpret_cast<const char *>(&value), sizeof(value));
return true;
}
NumberType value; NumberType value;
if (!tryGetNumericValueFromJSONElement<JSONParser, NumberType>(value, element, insert_settings.convert_bool_to_integer || this->is_bool_type, error)) if (!tryGetNumericValueFromJSONElement<JSONParser, NumberType>(value, element, insert_settings.convert_bool_to_integer || this->is_bool_type, insert_settings.allow_type_conversion, error))
{ {
if (error.empty()) if (error.empty())
error = fmt::format("cannot read {} value from JSON element: {}", TypeName<NumberType>, jsonElementToString<JSONParser>(element, format_settings)); error = fmt::format("cannot read {} value from JSON element: {}", TypeName<NumberType>, jsonElementToString<JSONParser>(element, format_settings));
@ -316,7 +340,7 @@ public:
bool insertResultToColumn( bool insertResultToColumn(
IColumn & column, IColumn & column,
const typename JSONParser::Element & element, const typename JSONParser::Element & element,
const JSONExtractInsertSettings &, const JSONExtractInsertSettings & insert_settings,
const FormatSettings & format_settings, const FormatSettings & format_settings,
String & error) const override String & error) const override
{ {
@ -333,6 +357,9 @@ public:
if (!element.isString()) if (!element.isString())
{ {
if (!insert_settings.allow_type_conversion)
return false;
auto & col_str = assert_cast<ColumnString &>(column); auto & col_str = assert_cast<ColumnString &>(column);
auto & chars = col_str.getChars(); auto & chars = col_str.getChars();
WriteBufferFromVector<ColumnString::Chars> buf(chars, AppendModeTag()); WriteBufferFromVector<ColumnString::Chars> buf(chars, AppendModeTag());
@ -360,7 +387,7 @@ public:
bool insertResultToColumn( bool insertResultToColumn(
IColumn & column, IColumn & column,
const typename JSONParser::Element & element, const typename JSONParser::Element & element,
const JSONExtractInsertSettings &, const JSONExtractInsertSettings & insert_settings,
const FormatSettings & format_settings, const FormatSettings & format_settings,
String & error) const override String & error) const override
{ {
@ -378,6 +405,9 @@ public:
if (!element.isString()) if (!element.isString())
{ {
if (!insert_settings.allow_type_conversion)
return false;
auto value = jsonElementToString<JSONParser>(element, format_settings); auto value = jsonElementToString<JSONParser>(element, format_settings);
assert_cast<ColumnLowCardinality &>(column).insertData(value.data(), value.size()); assert_cast<ColumnLowCardinality &>(column).insertData(value.data(), value.size());
} }
@ -402,7 +432,7 @@ public:
bool insertResultToColumn( bool insertResultToColumn(
IColumn & column, IColumn & column,
const typename JSONParser::Element & element, const typename JSONParser::Element & element,
const JSONExtractInsertSettings &, const JSONExtractInsertSettings & insert_settings,
const FormatSettings & format_settings, const FormatSettings & format_settings,
String & error) const override String & error) const override
{ {
@ -419,7 +449,11 @@ public:
} }
if (!element.isString()) if (!element.isString())
{
if (!insert_settings.allow_type_conversion)
return false;
return checkValueSizeAndInsert(column, jsonElementToString<JSONParser>(element, format_settings), error); return checkValueSizeAndInsert(column, jsonElementToString<JSONParser>(element, format_settings), error);
}
return checkValueSizeAndInsert(column, element.getString(), error); return checkValueSizeAndInsert(column, element.getString(), error);
} }
@ -450,7 +484,7 @@ public:
bool insertResultToColumn( bool insertResultToColumn(
IColumn & column, IColumn & column,
const typename JSONParser::Element & element, const typename JSONParser::Element & element,
const JSONExtractInsertSettings &, const JSONExtractInsertSettings & insert_settings,
const FormatSettings & format_settings, const FormatSettings & format_settings,
String & error) const override String & error) const override
{ {
@ -466,7 +500,11 @@ public:
} }
if (!element.isString()) if (!element.isString())
{
if (!insert_settings.allow_type_conversion)
return false;
return checkValueSizeAndInsert(column, jsonElementToString<JSONParser>(element, format_settings), error); return checkValueSizeAndInsert(column, jsonElementToString<JSONParser>(element, format_settings), error);
}
return checkValueSizeAndInsert(column, element.getString(), error); return checkValueSizeAndInsert(column, element.getString(), error);
} }
@ -630,7 +668,7 @@ public:
bool insertResultToColumn( bool insertResultToColumn(
IColumn & column, IColumn & column,
const typename JSONParser::Element & element, const typename JSONParser::Element & element,
const JSONExtractInsertSettings &, const JSONExtractInsertSettings & insert_settings,
const FormatSettings & format_settings, const FormatSettings & format_settings,
String & error) const override String & error) const override
{ {
@ -649,7 +687,7 @@ public:
return false; return false;
} }
} }
else if (element.isUInt64()) else if (element.isUInt64() && insert_settings.allow_type_conversion)
{ {
value = element.getUInt64(); value = element.getUInt64();
} }
@ -712,7 +750,8 @@ public:
case ElementType::INT64: case ElementType::INT64:
value = convertToDecimal<DataTypeNumber<Int64>, DataTypeDecimal<DecimalType>>(element.getInt64(), scale); value = convertToDecimal<DataTypeNumber<Int64>, DataTypeDecimal<DecimalType>>(element.getInt64(), scale);
break; break;
case ElementType::STRING: { case ElementType::STRING:
{
auto rb = ReadBufferFromMemory{element.getString()}; auto rb = ReadBufferFromMemory{element.getString()};
if (!SerializationDecimal<DecimalType>::tryReadText(value, rb, DecimalUtils::max_precision<DecimalType>, scale)) if (!SerializationDecimal<DecimalType>::tryReadText(value, rb, DecimalUtils::max_precision<DecimalType>, scale))
{ {
@ -721,7 +760,8 @@ public:
} }
break; break;
} }
case ElementType::NULL_VALUE: { case ElementType::NULL_VALUE:
{
if (!format_settings.null_as_default) if (!format_settings.null_as_default)
{ {
error = "cannot convert null to Decimal value"; error = "cannot convert null to Decimal value";
@ -756,7 +796,7 @@ public:
bool insertResultToColumn( bool insertResultToColumn(
IColumn & column, IColumn & column,
const typename JSONParser::Element & element, const typename JSONParser::Element & element,
const JSONExtractInsertSettings &, const JSONExtractInsertSettings & insert_settings,
const FormatSettings & format_settings, const FormatSettings & format_settings,
String & error) const override String & error) const override
{ {
@ -777,6 +817,9 @@ public:
} }
else else
{ {
if (!insert_settings.allow_type_conversion)
return false;
switch (element.type()) switch (element.type())
{ {
case ElementType::DOUBLE: case ElementType::DOUBLE:
@ -1104,7 +1147,7 @@ public:
} }
} }
if (!were_valid_elements) if (data.size() != old_size && !were_valid_elements)
{ {
data.popBack(data.size() - old_size); data.popBack(data.size() - old_size);
return false; return false;
@ -1174,7 +1217,7 @@ public:
else else
{ {
set_size(old_size); set_size(old_size);
error += fmt::format("(during reading tuple {} element)", index); error += fmt::format(" (during reading tuple {} element)", index);
return false; return false;
} }
} }
@ -1202,7 +1245,7 @@ public:
else else
{ {
set_size(old_size); set_size(old_size);
error += fmt::format("(during reading tuple {} element)", index); error += fmt::format(" (during reading tuple {} element)", index);
return false; return false;
} }
} }
@ -1221,7 +1264,7 @@ public:
else if (!insert_settings.insert_default_on_invalid_elements_in_complex_types) else if (!insert_settings.insert_default_on_invalid_elements_in_complex_types)
{ {
set_size(old_size); set_size(old_size);
error += fmt::format("(during reading tuple element \"{}\")", key); error += fmt::format(" (during reading tuple element \"{}\")", key);
return false; return false;
} }
} }
@ -1288,7 +1331,7 @@ public:
{ {
key_col.popBack(key_col.size() - offsets.back()); key_col.popBack(key_col.size() - offsets.back());
value_col.popBack(value_col.size() - offsets.back()); value_col.popBack(value_col.size() - offsets.back());
error += fmt::format("(during reading value of key \"{}\")", pair.first); error += fmt::format(" (during reading value of key \"{}\")", pair.first);
return false; return false;
} }
} }
@ -1346,6 +1389,13 @@ template <typename JSONParser>
class DynamicNode : public JSONExtractTreeNode<JSONParser> class DynamicNode : public JSONExtractTreeNode<JSONParser>
{ {
public: public:
/// Creates a node that parses JSON elements into a Dynamic column.
/// The two limits are remembered and later passed to the JSON object type that is
/// created when the type inferred for an element is a JSON object.
explicit DynamicNode(
    size_t max_dynamic_paths_for_object_ = DataTypeObject::DEFAULT_MAX_SEPARATELY_STORED_PATHS,
    size_t max_dynamic_types_for_object_ = DataTypeDynamic::DEFAULT_MAX_DYNAMIC_TYPES)
    : max_dynamic_paths_for_object(max_dynamic_paths_for_object_)
    , max_dynamic_types_for_object(max_dynamic_types_for_object_)
{
}
bool insertResultToColumn( bool insertResultToColumn(
IColumn & column, IColumn & column,
const typename JSONParser::Element & element, const typename JSONParser::Element & element,
@ -1354,7 +1404,7 @@ public:
String & error) const override String & error) const override
{ {
auto & column_dynamic = assert_cast<ColumnDynamic &>(column); auto & column_dynamic = assert_cast<ColumnDynamic &>(column);
/// First, check if element is NULL. /// Check if element is NULL.
if (element.isNull()) if (element.isNull())
{ {
column_dynamic.insertDefault(); column_dynamic.insertDefault();
@ -1363,15 +1413,52 @@ public:
auto & variant_column = column_dynamic.getVariantColumn(); auto & variant_column = column_dynamic.getVariantColumn();
const auto & variant_info = column_dynamic.getVariantInfo(); const auto & variant_info = column_dynamic.getVariantInfo();
/// Second, infer ClickHouse type for this element and add it as a new variant. const auto & variant_types = assert_cast<const DataTypeVariant &>(*variant_info.variant_type).getVariants();
auto element_type = elementToDataType(element, format_settings);
/// Try to insert element into current variants but with no types conversion.
/// We want to avoid inferring the type on each row, so if we can insert this element into
/// any existing variant with no types conversion (like Integer -> String, Double -> Integer, etc)
/// we will do it and won't try to infer the type.
auto shared_variant_discr = column_dynamic.getSharedVariantDiscriminator();
auto insert_settings_with_no_type_conversion = insert_settings;
insert_settings_with_no_type_conversion.allow_type_conversion = false;
for (size_t i = 0; i != variant_info.variant_names.size(); ++i)
{
if (i != shared_variant_discr)
{
auto it = json_extract_nodes_cache.find(variant_info.variant_names[i]);
if (it == json_extract_nodes_cache.end())
it = json_extract_nodes_cache.emplace(variant_info.variant_names[i], buildJSONExtractTree<JSONParser>(variant_types[i], "Dynamic inference")).first;
if (it->second->insertResultToColumn(variant_column.getVariantByGlobalDiscriminator(i), element, insert_settings_with_no_type_conversion, format_settings, error))
{
variant_column.getLocalDiscriminators().push_back(variant_column.localDiscriminatorByGlobal(i));
variant_column.getOffsets().push_back(variant_column.getVariantByGlobalDiscriminator(i).size() - 1);
return true;
}
}
}
/// We couldn't insert element into current variants, infer ClickHouse type for this element and add it as a new variant.
auto element_type = removeNullable(elementToDataType(element, format_settings));
if (!checkIfTypeIsComplete(element_type))
{
throw Exception(
ErrorCodes::INCORRECT_DATA,
"Cannot infer the type of JSON element {}, because it contains only nulls. To use String type for elements with incomplete "
"type, enable setting input_format_json_infer_incomplete_types_as_strings",
jsonElementToString<JSONParser>(element, format_settings));
}
auto element_type_name = element_type->getName(); auto element_type_name = element_type->getName();
if (column_dynamic.addNewVariant(element_type, element_type_name)) if (column_dynamic.addNewVariant(element_type, element_type_name))
{ {
auto node = buildJSONExtractTree<JSONParser>(element_type, "Dynamic inference"); auto it = json_extract_nodes_cache.find(element_type_name);
if (it == json_extract_nodes_cache.end())
it = json_extract_nodes_cache.emplace(element_type_name, buildJSONExtractTree<JSONParser>(element_type, "Dynamic inference")).first;
auto global_discriminator = variant_info.variant_name_to_discriminator.at(element_type_name); auto global_discriminator = variant_info.variant_name_to_discriminator.at(element_type_name);
auto & variant = variant_column.getVariantByGlobalDiscriminator(global_discriminator); auto & variant = variant_column.getVariantByGlobalDiscriminator(global_discriminator);
if (!node->insertResultToColumn(variant, element, insert_settings, format_settings, error)) if (!it->second->insertResultToColumn(variant, element, insert_settings, format_settings, error))
return false; return false;
variant_column.getLocalDiscriminators().push_back(variant_column.localDiscriminatorByGlobal(global_discriminator)); variant_column.getLocalDiscriminators().push_back(variant_column.localDiscriminatorByGlobal(global_discriminator));
variant_column.getOffsets().push_back(variant.size() - 1); variant_column.getOffsets().push_back(variant.size() - 1);
@ -1383,25 +1470,28 @@ public:
auto node = buildJSONExtractTree<JSONParser>(element_type, "Dynamic inference"); auto node = buildJSONExtractTree<JSONParser>(element_type, "Dynamic inference");
if (!node->insertResultToColumn(*tmp_variant_column, element, insert_settings, format_settings, error)) if (!node->insertResultToColumn(*tmp_variant_column, element, insert_settings, format_settings, error))
return false; return false;
column_dynamic.insertValueIntoSharedVariant(*tmp_variant_column, element_type, element_type_name, 0); column_dynamic.insertValueIntoSharedVariant(*tmp_variant_column, element_type, element_type_name, 0);
return true; return true;
} }
static DataTypePtr elementToDataType(const typename JSONParser::Element & element, const FormatSettings & format_settings) DataTypePtr elementToDataType(const typename JSONParser::Element & element, const FormatSettings & format_settings) const
{ {
JSONInferenceInfo json_inference_info; JSONInferenceInfo json_inference_info;
auto type = elementToDataTypeImpl(element, format_settings, json_inference_info); auto type = elementToDataTypeImpl(element, format_settings, json_inference_info);
transformFinalInferredJSONTypeIfNeeded(type, format_settings, &json_inference_info); transformFinalInferredJSONTypeIfNeeded(type, format_settings, &json_inference_info);
if (format_settings.schema_inference_make_columns_nullable && type->haveSubtypes())
type = makeNullableRecursively(type);
return type; return type;
} }
private: private:
static DataTypePtr elementToDataTypeImpl(const typename JSONParser::Element & element, const FormatSettings & format_settings, JSONInferenceInfo & json_inference_info) DataTypePtr elementToDataTypeImpl(const typename JSONParser::Element & element, const FormatSettings & format_settings, JSONInferenceInfo & json_inference_info) const
{ {
switch (element.type()) switch (element.type())
{ {
case ElementType::NULL_VALUE: case ElementType::NULL_VALUE:
return makeNullable(std::make_shared<DataTypeNothing>()); return std::make_shared<DataTypeNullable>(std::make_shared<DataTypeNothing>());
case ElementType::BOOL: case ElementType::BOOL:
return DataTypeFactory::instance().get("Bool"); return DataTypeFactory::instance().get("Bool");
case ElementType::INT64: case ElementType::INT64:
@ -1439,10 +1529,10 @@ private:
DataTypes types; DataTypes types;
types.reserve(array.size()); types.reserve(array.size());
for (auto value : array) for (auto value : array)
types.push_back(makeNullableSafe(elementToDataTypeImpl(value, format_settings, json_inference_info))); types.push_back(elementToDataTypeImpl(value, format_settings, json_inference_info));
if (types.empty()) if (types.empty())
return std::make_shared<DataTypeArray>(makeNullable(std::make_shared<DataTypeNothing>())); return std::make_shared<DataTypeArray>(std::make_shared<DataTypeNothing>());
if (checkIfTypesAreEqual(types)) if (checkIfTypesAreEqual(types))
return std::make_shared<DataTypeArray>(types.back()); return std::make_shared<DataTypeArray>(types.back());
@ -1469,12 +1559,238 @@ private:
return std::make_shared<DataTypeTuple>(types); return std::make_shared<DataTypeTuple>(types);
} }
case ElementType::OBJECT: { case ElementType::OBJECT:
/// TODO: Use new JSON type here when it's ready. {
return std::make_shared<DataTypeMap>(std::make_shared<DataTypeString>(), makeNullable(std::make_shared<DataTypeString>())); return std::make_shared<DataTypeObject>(DataTypeObject::SchemaFormat::JSON, max_dynamic_paths_for_object, max_dynamic_types_for_object);
} }
} }
} }
size_t max_dynamic_paths_for_object;
size_t max_dynamic_types_for_object;
/// Avoid building JSONExtractTreeNode for the same data types on each row by using cache.
mutable std::unordered_map<String, std::unique_ptr<JSONExtractTreeNode<JSONParser>>> json_extract_nodes_cache;
};
template <typename JSONParser>
class ObjectJSONNode : public JSONExtractTreeNode<JSONParser>
{
public:
/// Builds a node that parses JSON objects into a ColumnObject.
///  - typed_path_nodes_: parsers for the paths that have an explicit type, keyed by path.
///  - paths_to_skip_ / path_regexps_to_skip_: paths (and regexps over paths) that must not be stored.
///  - max_dynamic_paths_ / max_dynamic_types_: limits of this object; they are divided by the
///    nested-object reduce factors before being handed to the DynamicNode used for untyped paths.
ObjectJSONNode(
    std::unordered_map<String, std::unique_ptr<JSONExtractTreeNode<JSONParser>>> typed_path_nodes_,
    const std::unordered_set<String> & paths_to_skip_,
    const std::vector<String> & path_regexps_to_skip_,
    size_t max_dynamic_paths_,
    size_t max_dynamic_types_)
    : typed_path_nodes(std::move(typed_path_nodes_))
    , paths_to_skip(paths_to_skip_)
    , sorted_paths_to_skip(paths_to_skip_.begin(), paths_to_skip_.end())
    , dynamic_node(std::make_unique<DynamicNode<JSONParser>>(
          max_dynamic_paths_ / DataTypeObject::NESTED_OBJECT_MAX_DYNAMIC_PATHS_REDUCE_FACTOR,
          max_dynamic_types_ / DataTypeObject::NESTED_OBJECT_MAX_DYNAMIC_TYPES_REDUCE_FACTOR))
    , dynamic_serialization(std::make_shared<SerializationDynamic>())
{
    /// A sorted copy of the skipped paths is what shouldSkipPath() runs its binary search on.
    std::sort(sorted_paths_to_skip.begin(), sorted_paths_to_skip.end());

    /// Compile every skip pattern once, at construction time.
    for (const auto & pattern : path_regexps_to_skip_)
        path_regexps_to_skip.emplace_back(pattern);
}
/// Parses one JSON element as a JSON object and appends it as a new row to `column` (a ColumnObject).
///
/// A null element inserts a default row when format_settings.null_as_default is set.
/// Any other non-object element is rejected with a message in `error`.
/// On every failure after the traversal has started, the column is restored to its previous size.
///
/// Returns true if a row was appended, false otherwise (with `error` filled).
bool insertResultToColumn(IColumn & column, const typename JSONParser::Element & element, const JSONExtractInsertSettings & insert_settings, const FormatSettings & format_settings, String & error) const override
{
    if (element.isNull() && format_settings.null_as_default)
    {
        column.insertDefault();
        return true;
    }

    if (!element.isObject())
    {
        error = fmt::format("Cannot read JSON object from JSON element: {}", jsonElementToString<JSONParser>(element, format_settings));
        return false;
    }

    auto & column_object = assert_cast<ColumnObject &>(column);
    size_t prev_size = column_object.size();

    /// Paths in shared data should be sorted, so we cannot insert paths there during traverse.
    /// Instead we collect all paths and values that should go to shared data, sort them and insert later.
    /// It's not optimal, but it's a price we pay for faster reading of subcolumns.
    std::vector<std::pair<String, String>> paths_and_values_for_shared_data;
    if (!traverseAndInsert(column_object, element, "", insert_settings, format_settings, paths_and_values_for_shared_data, prev_size, error))
    {
        /// If there was an error, restore previous state.
        SerializationObject::restoreColumnObject(column_object, prev_size);
        return false;
    }

    /// Fill shared data.
    auto [shared_data_paths, shared_data_values] = column_object.getSharedDataPathsAndValues();

    /// Order by path only and keep the traversal order among equal paths. When duplicated paths are
    /// skipped, this makes the first occurrence the one that is kept, as for typed and dynamic paths.
    /// Sorting whole (path, value) pairs would keep the smallest serialized value instead.
    std::stable_sort(
        paths_and_values_for_shared_data.begin(),
        paths_and_values_for_shared_data.end(),
        [](const auto & lhs, const auto & rhs) { return lhs.first < rhs.first; });

    for (size_t i = 0; i != paths_and_values_for_shared_data.size(); ++i)
    {
        const auto & [path, value] = paths_and_values_for_shared_data[i];
        /// Check if we duplicated paths.
        if (i != 0 && path == paths_and_values_for_shared_data[i - 1].first)
        {
            if (!format_settings.json.type_json_skip_duplicated_paths)
            {
                error = fmt::format("Duplicate path found during parsing JSON object: {}. You can enable setting type_json_skip_duplicated_paths to skip duplicated paths during insert", path);
                SerializationObject::restoreColumnObject(column_object, prev_size);
                return false;
            }
        }
        else
        {
            shared_data_paths->insertData(path.data(), path.size());
            shared_data_values->insertData(value.data(), value.size());
        }
    }

    column_object.getSharedDataOffsets().push_back(shared_data_paths->size());

    /// Fill remaining typed and dynamic paths: every path that got no value in this row receives a default.
    for (auto & [_, typed_column] : column_object.getTypedPaths())
    {
        if (typed_column->size() == prev_size)
            typed_column->insertDefault();
    }

    for (auto & [_, dynamic_column] : column_object.getDynamicPathsPtrs())
    {
        if (dynamic_column->size() == prev_size)
            dynamic_column->insertDefault();
    }

    return true;
}
private:
/// Recursively walks `element`, whose path inside the top-level object is `current_path`,
/// and stores every leaf value of the row being built:
///  - into the typed column, if the path has an explicit type;
///  - into an existing or newly added dynamic path column;
///  - otherwise as a serialized (path, value) pair appended to `paths_and_values_for_shared_data`.
/// `current_size` is the column size before this row; a column that is already larger has received
/// a value for this row, i.e. the path is duplicated.
/// Returns false and fills `error` on failure. Restoring the column is left to the caller.
bool traverseAndInsert(
    ColumnObject & column_object,
    const typename JSONParser::Element & element,
    const String & current_path,
    const JSONExtractInsertSettings & insert_settings,
    const FormatSettings & format_settings,
    std::vector<std::pair<String, String>> & paths_and_values_for_shared_data,
    size_t current_size,
    String & error) const
{
    if (shouldSkipPath(current_path))
        return true;

    /// Objects are flattened into dotted paths, unless the path itself has an explicit type.
    if (element.isObject() && !typed_path_nodes.contains(current_path))
    {
        for (auto [key, value] : element.getObject())
        {
            String child_path(current_path);
            if (!current_path.empty())
                child_path += ".";
            child_path += key;

            if (!traverseAndInsert(column_object, value, child_path, insert_settings, format_settings, paths_and_values_for_shared_data, current_size, error))
                return false;
        }

        return true;
    }

    /// A repeated path is either silently ignored or reported, depending on the setting.
    auto on_duplicated_path = [&]() -> bool
    {
        if (format_settings.json.type_json_skip_duplicated_paths)
            return true;

        error = fmt::format("Duplicate path found during parsing JSON object: {}. You can enable setting type_json_skip_duplicated_paths to skip duplicated paths during insert", current_path);
        return false;
    };

    /// Appends the path to the error produced by a nested node.
    auto fail_with_path = [&]() -> bool
    {
        error += fmt::format(" (while reading path {})", current_path);
        return false;
    };

    auto & typed_paths = column_object.getTypedPaths();
    auto & dynamic_paths_ptrs = column_object.getDynamicPathsPtrs();

    /// Path with an explicit type.
    if (auto typed_it = typed_paths.find(current_path); typed_it != typed_paths.end())
    {
        if (typed_it->second->size() > current_size)
            return on_duplicated_path();

        if (typed_path_nodes.at(current_path)->insertResultToColumn(*typed_it->second, element, insert_settings, format_settings, error))
            return true;

        return fail_with_path();
    }

    /// Path that already has its own dynamic column.
    if (auto dynamic_it = dynamic_paths_ptrs.find(current_path); dynamic_it != dynamic_paths_ptrs.end())
    {
        if (dynamic_it->second->size() > current_size)
            return on_duplicated_path();

        if (dynamic_node->insertResultToColumn(*dynamic_it->second, element, insert_settings, format_settings, error))
            return true;

        return fail_with_path();
    }

    /// Don't create new dynamic paths for null and don't insert null values into shared data.
    /// Null is considered equivalent to the absence of this path.
    if (element.isNull())
        return true;

    /// Try to give the path its own dynamic column.
    if (auto * new_dynamic_column = column_object.tryToAddNewDynamicPath(current_path))
    {
        if (dynamic_node->insertResultToColumn(*new_dynamic_column, element, insert_settings, format_settings, error))
            return true;

        return fail_with_path();
    }

    /// Otherwise the value goes to the shared data: parse it into a temporary one-row Dynamic column
    /// and keep its binary serialization next to the path.
    auto single_value_column = ColumnDynamic::create();
    single_value_column->reserve(1);
    if (!dynamic_node->insertResultToColumn(*single_value_column, element, insert_settings, format_settings, error))
        return fail_with_path();

    paths_and_values_for_shared_data.emplace_back(current_path, "");
    WriteBufferFromString buf(paths_and_values_for_shared_data.back().second);
    dynamic_serialization->serializeBinary(*single_value_column, 0, buf, format_settings);
    return true;
}
/// Returns true if `path` must not be stored in the object, i.e. if
///  - it is listed in paths_to_skip, or
///  - it lies inside a skipped path (the skipped path followed by '.'), or
///  - it fully matches one of the skip regexps.
bool shouldSkipPath(const String & path) const
{
    if (paths_to_skip.contains(path))
        return true;

    if (!sorted_paths_to_skip.empty())
    {
        /// The skipped path that sorts right before `path` is the prefix candidate.
        /// NOTE(review): only this single nearest candidate is examined, as before.
        auto it = std::lower_bound(sorted_paths_to_skip.begin(), sorted_paths_to_skip.end(), path);
        if (it != sorted_paths_to_skip.begin())
        {
            const auto & candidate = *std::prev(it);
            /// A bare prefix match is not enough: skipping "a" must not skip the unrelated path "ab",
            /// so the prefix has to end exactly at a path separator.
            /// candidate < path, so a candidate that is a prefix is strictly shorter than `path`
            /// and the index below is in range.
            if (path.starts_with(candidate) && path[candidate.size()] == '.')
                return true;
        }
    }

    for (const auto & regexp : path_regexps_to_skip)
    {
        if (re2::RE2::FullMatch(path, regexp))
            return true;
    }

    return false;
}
std::unordered_map<String, std::unique_ptr<JSONExtractTreeNode<JSONParser>>> typed_path_nodes;
std::unordered_set<String> paths_to_skip;
std::vector<String> sorted_paths_to_skip;
std::list<re2::RE2> path_regexps_to_skip;
std::unique_ptr<DynamicNode<JSONParser>> dynamic_node;
std::shared_ptr<SerializationDynamic> dynamic_serialization;
}; };
} }
@ -1621,6 +1937,26 @@ std::unique_ptr<JSONExtractTreeNode<JSONParser>> buildJSONExtractTree(const Data
} }
case TypeIndex::Dynamic: case TypeIndex::Dynamic:
return std::make_unique<DynamicNode<JSONParser>>(); return std::make_unique<DynamicNode<JSONParser>>();
case TypeIndex::Object:
{
const auto & object_type = assert_cast<const DataTypeObject &>(*type);
const auto & typed_paths = object_type.getTypedPaths();
std::unordered_map<String, std::unique_ptr<JSONExtractTreeNode<JSONParser>>> typed_path_nodes;
typed_path_nodes.reserve(typed_paths.size());
for (const auto & [path, path_type] : typed_paths)
typed_path_nodes[path] = buildJSONExtractTree<JSONParser>(path_type, source_for_exception_message);
switch (object_type.getSchemaFormat())
{
case DataTypeObject::SchemaFormat::JSON:
return std::make_unique<ObjectJSONNode<JSONParser>>(
std::move(typed_path_nodes),
object_type.getPathsToSkip(),
object_type.getPathRegexpsToSkip(),
object_type.getMaxDynamicPaths(),
object_type.getMaxDynamicTypes());
}
}
default: default:
throw Exception( throw Exception(
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
@ -1638,7 +1974,7 @@ template std::unique_ptr<JSONExtractTreeNode<SimdJSONParser>> buildJSONExtractTr
#if USE_RAPIDJSON #if USE_RAPIDJSON
template void jsonElementToString<RapidJSONParser>(const RapidJSONParser::Element & element, WriteBuffer & buf, const FormatSettings & format_settings); template void jsonElementToString<RapidJSONParser>(const RapidJSONParser::Element & element, WriteBuffer & buf, const FormatSettings & format_settings);
template std::unique_ptr<JSONExtractTreeNode<RapidJSONParser>> buildJSONExtractTree<RapidJSONParser>(const DataTypePtr & type, const char * source_for_exception_message); template std::unique_ptr<JSONExtractTreeNode<RapidJSONParser>> buildJSONExtractTree<RapidJSONParser>(const DataTypePtr & type, const char * source_for_exception_message);
template bool tryGetNumericValueFromJSONElement<RapidJSONParser, Float64>(Float64 & value, const RapidJSONParser::Element & element, bool convert_bool_to_integer, String & error); template bool tryGetNumericValueFromJSONElement<RapidJSONParser, Float64>(Float64 & value, const RapidJSONParser::Element & element, bool convert_bool_to_integer, bool allow_type_conversion, String & error);
#else #else
template void jsonElementToString<DummyJSONParser>(const DummyJSONParser::Element & element, WriteBuffer & buf, const FormatSettings & format_settings); template void jsonElementToString<DummyJSONParser>(const DummyJSONParser::Element & element, WriteBuffer & buf, const FormatSettings & format_settings);
template std::unique_ptr<JSONExtractTreeNode<DummyJSONParser>> buildJSONExtractTree<DummyJSONParser>(const DataTypePtr & type, const char * source_for_exception_message); template std::unique_ptr<JSONExtractTreeNode<DummyJSONParser>> buildJSONExtractTree<DummyJSONParser>(const DataTypePtr & type, const char * source_for_exception_message);

View File

@ -17,6 +17,9 @@ struct JSONExtractInsertSettings
/// For example, if we have [1, "hello", 2] and type Array(UInt32), /// For example, if we have [1, "hello", 2] and type Array(UInt32),
/// we will insert [1, 0, 2] in the column. Used in all JSONExtract functions. /// we will insert [1, 0, 2] in the column. Used in all JSONExtract functions.
bool insert_default_on_invalid_elements_in_complex_types = false; bool insert_default_on_invalid_elements_in_complex_types = false;
/// If false, JSON value will be inserted into column only if type of the value is
/// the same as column type (no conversions like Integer -> String, Integer -> Float, etc).
bool allow_type_conversion = true;
}; };
template <typename JSONParser> template <typename JSONParser>
@ -36,6 +39,6 @@ template <typename JSONParser>
void jsonElementToString(const typename JSONParser::Element & element, WriteBuffer & buf, const FormatSettings & format_settings); void jsonElementToString(const typename JSONParser::Element & element, WriteBuffer & buf, const FormatSettings & format_settings);
template <typename JSONParser, typename NumberType> template <typename JSONParser, typename NumberType>
bool tryGetNumericValueFromJSONElement(NumberType & value, const typename JSONParser::Element & element, bool convert_bool_to_integer, String & error); bool tryGetNumericValueFromJSONElement(NumberType & value, const typename JSONParser::Element & element, bool convert_bool_to_integer, bool allow_type_conversion, String & error);
} }

View File

@ -6,7 +6,7 @@
#include <IO/WriteBufferValidUTF8.h> #include <IO/WriteBufferValidUTF8.h>
#include <DataTypes/Serializations/SerializationNullable.h> #include <DataTypes/Serializations/SerializationNullable.h>
#include <DataTypes/DataTypeNullable.h> #include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeObject.h> #include <DataTypes/DataTypeObjectDeprecated.h>
#include <DataTypes/DataTypeFactory.h> #include <DataTypes/DataTypeFactory.h>
#include <base/find_symbols.h> #include <base/find_symbols.h>

View File

@ -11,7 +11,7 @@
#include <DataTypes/DataTypeLowCardinality.h> #include <DataTypes/DataTypeLowCardinality.h>
#include <DataTypes/DataTypeNothing.h> #include <DataTypes/DataTypeNothing.h>
#include <DataTypes/transformTypesRecursively.h> #include <DataTypes/transformTypesRecursively.h>
#include <DataTypes/DataTypeObject.h> #include <DataTypes/DataTypeObjectDeprecated.h>
#include <DataTypes/DataTypeFactory.h> #include <DataTypes/DataTypeFactory.h>
#include <IO/ReadBufferFromString.h> #include <IO/ReadBufferFromString.h>
#include <IO/ReadHelpers.h> #include <IO/ReadHelpers.h>
@ -1216,8 +1216,8 @@ namespace
{ {
if constexpr (is_json) if constexpr (is_json)
{ {
if (settings.json.allow_object_type) if (settings.json.allow_deprecated_object_type)
return std::make_shared<DataTypeObject>("json", true); return std::make_shared<DataTypeObjectDeprecated>("json", true);
} }
/// Empty Map is Map(Nothing, Nothing) /// Empty Map is Map(Nothing, Nothing)
@ -1226,8 +1226,8 @@ namespace
if constexpr (is_json) if constexpr (is_json)
{ {
if (settings.json.allow_object_type) if (settings.json.allow_deprecated_object_type)
return std::make_shared<DataTypeObject>("json", true); return std::make_shared<DataTypeObjectDeprecated>("json", true);
if (settings.json.read_objects_as_strings) if (settings.json.read_objects_as_strings)
return std::make_shared<DataTypeString>(); return std::make_shared<DataTypeString>();
@ -1282,7 +1282,7 @@ namespace
{ {
if constexpr (is_json) if constexpr (is_json)
{ {
if (!settings.json.allow_object_type && settings.json.try_infer_objects_as_tuples) if (!settings.json.allow_deprecated_object_type && settings.json.try_infer_objects_as_tuples)
return tryInferJSONPaths(buf, settings, json_info, depth); return tryInferJSONPaths(buf, settings, json_info, depth);
} }
@ -1302,7 +1302,7 @@ namespace
if (checkCharCaseInsensitive('n', buf)) if (checkCharCaseInsensitive('n', buf))
{ {
if (checkStringCaseInsensitive("ull", buf)) if (checkStringCaseInsensitive("ull", buf))
return makeNullable(std::make_shared<DataTypeNothing>()); return std::make_shared<DataTypeNullable>(std::make_shared<DataTypeNothing>());
else if (checkStringCaseInsensitive("an", buf)) else if (checkStringCaseInsensitive("an", buf))
return std::make_shared<DataTypeFloat64>(); return std::make_shared<DataTypeFloat64>();
} }
@ -1568,15 +1568,15 @@ DataTypePtr makeNullableRecursively(DataTypePtr type)
return nested_type ? std::make_shared<DataTypeLowCardinality>(nested_type) : nullptr; return nested_type ? std::make_shared<DataTypeLowCardinality>(nested_type) : nullptr;
} }
if (which.isObject()) if (which.isObjectDeprecated())
{ {
const auto * object_type = assert_cast<const DataTypeObject *>(type.get()); const auto * object_type = assert_cast<const DataTypeObjectDeprecated *>(type.get());
if (object_type->hasNullableSubcolumns()) if (object_type->hasNullableSubcolumns())
return type; return type;
return std::make_shared<DataTypeObject>(object_type->getSchemaFormat(), true); return std::make_shared<DataTypeObjectDeprecated>(object_type->getSchemaFormat(), true);
} }
return makeNullable(type); return makeNullableSafe(type);
} }
NamesAndTypesList getNamesAndRecursivelyNullableTypes(const Block & header) NamesAndTypesList getNamesAndRecursivelyNullableTypes(const Block & header)

View File

@ -9,6 +9,7 @@
#include <Columns/ColumnMap.h> #include <Columns/ColumnMap.h>
#include <Columns/ColumnNothing.h> #include <Columns/ColumnNothing.h>
#include <Columns/ColumnNullable.h> #include <Columns/ColumnNullable.h>
#include <Columns/ColumnObjectDeprecated.h>
#include <Columns/ColumnObject.h> #include <Columns/ColumnObject.h>
#include <Columns/ColumnString.h> #include <Columns/ColumnString.h>
#include <Columns/ColumnStringHelpers.h> #include <Columns/ColumnStringHelpers.h>
@ -35,6 +36,7 @@
#include <DataTypes/DataTypeNested.h> #include <DataTypes/DataTypeNested.h>
#include <DataTypes/DataTypeNothing.h> #include <DataTypes/DataTypeNothing.h>
#include <DataTypes/DataTypeNullable.h> #include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeObjectDeprecated.h>
#include <DataTypes/DataTypeObject.h> #include <DataTypes/DataTypeObject.h>
#include <DataTypes/DataTypeString.h> #include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeTuple.h> #include <DataTypes/DataTypeTuple.h>
@ -3880,7 +3882,7 @@ private:
"Expected tuple with {} subcolumn, but got {} subcolumns", "Expected tuple with {} subcolumn, but got {} subcolumns",
tuple_size, column_tuple.getColumns().size()); tuple_size, column_tuple.getColumns().size());
auto res = ColumnObject::create(has_nullable_subcolumns); auto res = ColumnObjectDeprecated::create(has_nullable_subcolumns);
for (size_t i = 0; i < tuple_size; ++i) for (size_t i = 0; i < tuple_size; ++i)
{ {
ColumnsWithTypeAndName element = {{column_tuple.getColumns()[i], from_types[i], "" }}; ColumnsWithTypeAndName element = {{column_tuple.getColumns()[i], from_types[i], "" }};
@ -3957,7 +3959,7 @@ private:
subcolumn->insertDefault(); subcolumn->insertDefault();
} }
auto column_object = ColumnObject::create(has_nullable_subcolumns); auto column_object = ColumnObjectDeprecated::create(has_nullable_subcolumns);
for (auto && [key, subcolumn] : subcolumns) for (auto && [key, subcolumn] : subcolumns)
{ {
PathInData path(key.toView()); PathInData path(key.toView());
@ -3968,7 +3970,7 @@ private:
}; };
} }
WrapperType createObjectWrapper(const DataTypePtr & from_type, const DataTypeObject * to_type) const WrapperType createObjectDeprecatedWrapper(const DataTypePtr & from_type, const DataTypeObjectDeprecated * to_type) const
{ {
if (const auto * from_tuple = checkAndGetDataType<DataTypeTuple>(from_type.get())) if (const auto * from_tuple = checkAndGetDataType<DataTypeTuple>(from_type.get()))
{ {
@ -3987,12 +3989,12 @@ private:
return res; return res;
}; };
} }
else if (checkAndGetDataType<DataTypeObject>(from_type.get())) else if (checkAndGetDataType<DataTypeObjectDeprecated>(from_type.get()))
{ {
return [is_nullable = to_type->hasNullableSubcolumns()] (ColumnsWithTypeAndName & arguments, const DataTypePtr & , const ColumnNullable * , size_t) -> ColumnPtr return [is_nullable = to_type->hasNullableSubcolumns()] (ColumnsWithTypeAndName & arguments, const DataTypePtr & , const ColumnNullable * , size_t) -> ColumnPtr
{ {
const auto & column_object = assert_cast<const ColumnObject &>(*arguments.front().column); const auto & column_object = assert_cast<const ColumnObjectDeprecated &>(*arguments.front().column);
auto res = ColumnObject::create(is_nullable); auto res = ColumnObjectDeprecated::create(is_nullable);
for (size_t i = 0; i < column_object.size(); i++) for (size_t i = 0; i < column_object.size(); i++)
res->insert(column_object[i]); res->insert(column_object[i]);
@ -4005,6 +4007,25 @@ private:
"Cast to Object can be performed only from flatten named Tuple, Map or String. Got: {}", from_type->getName()); "Cast to Object can be performed only from flatten named Tuple, Map or String. Got: {}", from_type->getName());
} }
/// Creates the wrapper that performs CAST to the Object type described by to_object.
/// Only String sources are supported here; any other source type is rejected with TYPE_MISMATCH.
WrapperType createObjectWrapper(const DataTypePtr & from_type, const DataTypeObject * to_object) const
{
    /// TODO: support CAST between JSON types with different parameters
    ///       support CAST from Map to JSON
    ///       support CAST from Tuple to JSON
    ///       support CAST from Object('json') to JSON
    const bool is_from_string = checkAndGetDataType<DataTypeString>(from_type.get()) != nullptr;
    if (!is_from_string)
        throw Exception(ErrorCodes::TYPE_MISMATCH, "Cast to {} can be performed only from String. Got: {}", magic_enum::enum_name(to_object->getSchemaFormat()), from_type->getName());

    /// The conversion itself is delegated to the generic from-String implementation;
    /// the resulting column is finalized before it is handed back to the caller.
    return [this](ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable * nullable_source, size_t input_rows_count)
    {
        auto converted_column = ConvertImplGenericFromString<true>::execute(arguments, result_type, nullable_source, input_rows_count, context)->assumeMutable();
        converted_column->finalize();
        return converted_column;
    };
}
WrapperType createVariantToVariantWrapper(const DataTypeVariant & from_variant, const DataTypeVariant & to_variant) const WrapperType createVariantToVariantWrapper(const DataTypeVariant & from_variant, const DataTypeVariant & to_variant) const
{ {
/// We support only extension of variant type, so, only new types can be added. /// We support only extension of variant type, so, only new types can be added.
@ -5079,6 +5100,8 @@ private:
return createTupleWrapper(from_type, checkAndGetDataType<DataTypeTuple>(to_type.get())); return createTupleWrapper(from_type, checkAndGetDataType<DataTypeTuple>(to_type.get()));
case TypeIndex::Map: case TypeIndex::Map:
return createMapWrapper(from_type, checkAndGetDataType<DataTypeMap>(to_type.get())); return createMapWrapper(from_type, checkAndGetDataType<DataTypeMap>(to_type.get()));
case TypeIndex::ObjectDeprecated:
return createObjectDeprecatedWrapper(from_type, checkAndGetDataType<DataTypeObjectDeprecated>(to_type.get()));
case TypeIndex::Object: case TypeIndex::Object:
return createObjectWrapper(from_type, checkAndGetDataType<DataTypeObject>(to_type.get())); return createObjectWrapper(from_type, checkAndGetDataType<DataTypeObject>(to_type.get()));
case TypeIndex::AggregateFunction: case TypeIndex::AggregateFunction:

View File

@ -739,7 +739,7 @@ public:
{ {
NumberType value; NumberType value;
if (!tryGetNumericValueFromJSONElement<JSONParser, NumberType>(value, element, convert_bool_to_integer, error)) if (!tryGetNumericValueFromJSONElement<JSONParser, NumberType>(value, element, convert_bool_to_integer, /*allow_type_conversion=*/true, error))
return false; return false;
auto & col_vec = assert_cast<ColumnVector<NumberType> &>(dest); auto & col_vec = assert_cast<ColumnVector<NumberType> &>(dest);
col_vec.insertValue(value); col_vec.insertValue(value);

518
src/Functions/JSONPaths.cpp Normal file
View File

@ -0,0 +1,518 @@
#include <Functions/IFunction.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/FunctionFactory.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeMap.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeObject.h>
#include <Core/ColumnNumbers.h>
#include <Columns/ColumnObject.h>
#include <Columns/ColumnMap.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnArray.h>
#include <DataTypes/DataTypesBinaryEncoding.h>
namespace DB
{
namespace ErrorCodes
{
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int BAD_ARGUMENTS;
}
namespace
{
/// Selects which group of paths of a JSON object column is reported by FunctionJSONPaths.
enum class PathsMode
{
/// Typed paths, dynamic paths and paths stored in shared data, merged together.
ALL_PATHS,
/// Only the paths returned by ColumnObject::getDynamicPaths().
DYNAMIC_PATHS,
/// Only the paths stored in the shared data of the column.
SHARED_DATA_PATHS,
};
/// Compile-time configurations for FunctionJSONPaths. Each struct provides:
///  - name: the SQL name of the function;
///  - paths_mode: which group of paths is reported;
///  - with_types: whether the function returns only the paths (false) or the paths together with type names (true).

/// JSONAllPaths: all paths, without types.
struct JSONAllPathsImpl
{
static constexpr auto name = "JSONAllPaths";
static constexpr auto paths_mode = PathsMode::ALL_PATHS;
static constexpr auto with_types = false;
};
/// JSONAllPathsWithTypes: all paths, with types.
struct JSONAllPathsWithTypesImpl
{
static constexpr auto name = "JSONAllPathsWithTypes";
static constexpr auto paths_mode = PathsMode::ALL_PATHS;
static constexpr auto with_types = true;
};
/// JSONDynamicPaths: dynamic paths only, without types.
struct JSONDynamicPathsImpl
{
static constexpr auto name = "JSONDynamicPaths";
static constexpr auto paths_mode = PathsMode::DYNAMIC_PATHS;
static constexpr auto with_types = false;
};
/// JSONDynamicPathsWithTypes: dynamic paths only, with types.
struct JSONDynamicPathsWithTypesImpl
{
static constexpr auto name = "JSONDynamicPathsWithTypes";
static constexpr auto paths_mode = PathsMode::DYNAMIC_PATHS;
static constexpr auto with_types = true;
};
/// JSONSharedDataPaths: paths from shared data only, without types.
struct JSONSharedDataPathsImpl
{
static constexpr auto name = "JSONSharedDataPaths";
static constexpr auto paths_mode = PathsMode::SHARED_DATA_PATHS;
static constexpr auto with_types = false;
};
/// JSONSharedDataPathsWithTypes: paths from shared data only, with types.
struct JSONSharedDataPathsWithTypesImpl
{
static constexpr auto name = "JSONSharedDataPathsWithTypes";
static constexpr auto paths_mode = PathsMode::SHARED_DATA_PATHS;
static constexpr auto with_types = true;
};
/// Implements the functions that extract paths (and optionally the type names of their values)
/// from a JSON object column. Used for introspection of the content of the JSON object column.
///
/// The behaviour is selected at compile time by Impl:
///  - Impl::name is the function name;
///  - Impl::paths_mode selects which paths are reported (all / dynamic / shared data);
///  - Impl::with_types selects the result: Array(String) with paths when false,
///    Map(String, String) from path to type name when true.
template <typename Impl>
class FunctionJSONPaths : public IFunction
{
public:
static constexpr auto name = Impl::name;
static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionJSONPaths>(); }
std::string getName() const override
{
return name;
}
size_t getNumberOfArguments() const override { return 1; }
bool useDefaultImplementationForConstants() const override { return true; }
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; }
/// Validates that there is exactly one argument and that its type id is TypeIndex::Object,
/// then returns Map(String, String) for the "with types" variants and Array(String) otherwise.
/// NOTE(review): the wrong-argument-type case is reported with NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
/// an "illegal type of argument" style error code may be more appropriate - confirm before changing,
/// since callers/tests may depend on the current code.
DataTypePtr getReturnTypeImpl(const DataTypes & data_types) const override
{
if (data_types.size() != 1)
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} requires single argument with type JSON", getName());
if (data_types[0]->getTypeId() != TypeIndex::Object)
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} requires argument with type JSON, got: {}", getName(),data_types[0]->getName());
if constexpr (Impl::with_types)
return std::make_shared<DataTypeMap>(std::make_shared<DataTypeString>(), std::make_shared<DataTypeString>());
return std::make_shared<DataTypeArray>(std::make_shared<DataTypeString>());
}
/// Checks that the argument column is a ColumnObject (throws BAD_ARGUMENTS otherwise)
/// and dispatches to executeWithTypes / executeWithoutTypes depending on Impl::with_types.
/// The result type and the row count arguments are not used.
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t) const override
{
const ColumnWithTypeAndName & elem = arguments[0];
const auto * column_object = typeid_cast<const ColumnObject *>(elem.column.get());
if (!column_object)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unexpected column type in function {}. Expected Object column, got {}", getName(), elem.column->getName());
const auto & type_object = assert_cast<const DataTypeObject &>(*elem.type);
if constexpr (Impl::with_types)
return executeWithTypes(*column_object, type_object);
return executeWithoutTypes(*column_object);
}
private:
/// Builds the Array(String) result: for every row, the sorted list of paths selected by Impl::paths_mode.
ColumnPtr executeWithoutTypes(const ColumnObject & column_object) const
{
if constexpr (Impl::paths_mode == PathsMode::SHARED_DATA_PATHS)
{
/// No need to do anything, we already have a column with all sorted paths in shared data.
/// The result reuses the first tuple element (paths) of the shared data nested column together with its offsets.
const auto & shared_data_array = column_object.getSharedDataNestedColumn();
const auto & shared_data_paths = assert_cast<const ColumnTuple &>(shared_data_array.getData()).getColumnPtr(0);
return ColumnArray::create(shared_data_paths, shared_data_array.getOffsetsPtr());
}
auto res = ColumnArray::create(ColumnString::create());
auto & offsets = res->getOffsets();
ColumnString & data = assert_cast<ColumnString &>(res->getData());
if constexpr (Impl::paths_mode == PathsMode::DYNAMIC_PATHS)
{
/// Collect all dynamic paths.
/// The string views refer to the keys of dynamic_path_columns, which is owned by column_object.
const auto & dynamic_path_columns = column_object.getDynamicPaths();
std::vector<std::string_view> dynamic_paths;
dynamic_paths.reserve(dynamic_path_columns.size());
for (const auto & [path, _] : dynamic_path_columns)
dynamic_paths.push_back(path);
/// We want the resulting arrays of paths to be sorted for consistency.
std::sort(dynamic_paths.begin(), dynamic_paths.end());
size_t size = column_object.size();
for (size_t i = 0; i != size; ++i)
{
for (const auto path : dynamic_paths)
{
/// Don't include path if it contains NULL, because we consider
/// it to be equivalent to the absence of this path in this row.
if (!dynamic_path_columns.find(path)->second->isNullAt(i))
data.insertData(path.data(), path.size());
}
/// Close the array of the current row.
offsets.push_back(data.size());
}
return res;
}
/// Collect all paths: typed, dynamic and paths from shared data.
std::vector<std::string_view> sorted_dynamic_and_typed_paths;
const auto & typed_path_columns = column_object.getTypedPaths();
const auto & dynamic_path_columns = column_object.getDynamicPaths();
sorted_dynamic_and_typed_paths.reserve(typed_path_columns.size() + dynamic_path_columns.size());
for (const auto & [path, _] : typed_path_columns)
sorted_dynamic_and_typed_paths.push_back(path);
for (const auto & [path, _] : dynamic_path_columns)
sorted_dynamic_and_typed_paths.push_back(path);
/// We want the resulting arrays of paths to be sorted for consistency.
std::sort(sorted_dynamic_and_typed_paths.begin(), sorted_dynamic_and_typed_paths.end());
const auto & shared_data_offsets = column_object.getSharedDataOffsets();
const auto [shared_data_paths, _] = column_object.getSharedDataPathsAndValues();
/// The number of rows is taken from the shared data offsets.
for (size_t i = 0; i != shared_data_offsets.size(); ++i)
{
/// [start, end) is the range of shared data entries that belong to row i.
/// NOTE(review): for i == 0 this reads index -1; presumably the offsets container is padded
/// so that this read is valid and yields 0 - confirm against the container type.
size_t start = shared_data_offsets[static_cast<ssize_t>(i) - 1];
size_t end = shared_data_offsets[static_cast<ssize_t>(i)];
/// Merge sorted list of paths from shared data and sorted_dynamic_and_typed_paths
size_t sorted_paths_index = 0;
for (size_t j = start; j != end; ++j)
{
auto shared_data_path = shared_data_paths->getDataAt(j).toView();
/// First output all typed/dynamic paths that are ordered before the current shared data path.
while (sorted_paths_index != sorted_dynamic_and_typed_paths.size() && sorted_dynamic_and_typed_paths[sorted_paths_index] < shared_data_path)
{
const auto path = sorted_dynamic_and_typed_paths[sorted_paths_index];
/// If it's dynamic path include it only if it's not NULL.
/// A path that is not found among the dynamic paths is a typed path and is always included.
if (auto it = dynamic_path_columns.find(path); it == dynamic_path_columns.end() || !it->second->isNullAt(i))
data.insertData(path.data(), path.size());
++sorted_paths_index;
}
data.insertData(shared_data_path.data(), shared_data_path.size());
}
/// Output the remaining typed/dynamic paths that are ordered after the last shared data path of this row.
for (; sorted_paths_index != sorted_dynamic_and_typed_paths.size(); ++sorted_paths_index)
{
const auto path = sorted_dynamic_and_typed_paths[sorted_paths_index];
if (auto it = dynamic_path_columns.find(path); it == dynamic_path_columns.end() || !it->second->isNullAt(i))
data.insertData(path.data(), path.size());
}
offsets.push_back(data.size());
}
return res;
}
/// Builds the Map(String, String) result: for every row, the sorted paths selected by Impl::paths_mode
/// mapped to the name of the type of the value stored at that path in that row.
/// type_object is used only to get the declared types of typed paths.
ColumnPtr executeWithTypes(const ColumnObject & column_object, const DataTypeObject & type_object) const
{
auto offsets_column = ColumnArray::ColumnOffsets::create();
auto & offsets = offsets_column->getData();
auto paths_column = ColumnString::create();
auto types_column = ColumnString::create();
if constexpr (Impl::paths_mode == PathsMode::DYNAMIC_PATHS)
{
const auto & dynamic_path_columns = column_object.getDynamicPaths();
std::vector<std::string_view> sorted_dynamic_paths;
sorted_dynamic_paths.reserve(dynamic_path_columns.size());
for (const auto & [path, _] : dynamic_path_columns)
sorted_dynamic_paths.push_back(path);
/// We want the resulting arrays of paths and values to be sorted for consistency.
std::sort(sorted_dynamic_paths.begin(), sorted_dynamic_paths.end());
/// Iterate over all rows and extract types from dynamic columns.
for (size_t i = 0; i != column_object.size(); ++i)
{
for (const auto path : sorted_dynamic_paths)
{
const auto & column = dynamic_path_columns.find(path)->second;
/// NULL is treated as the absence of the path in this row, so it is not reported.
if (!column->isNullAt(i))
{
auto type = getDynamicValueType(column, i);
paths_column->insertData(path.data(), path.size());
types_column->insertData(type.data(), type.size());
}
}
offsets.push_back(paths_column->size());
}
return ColumnMap::create(ColumnPtr(std::move(paths_column)), ColumnPtr(std::move(types_column)), ColumnPtr(std::move(offsets_column)));
}
if constexpr (Impl::paths_mode == PathsMode::SHARED_DATA_PATHS)
{
const auto & shared_data_offsets = column_object.getSharedDataOffsets();
const auto [shared_data_paths, shared_data_values] = column_object.getSharedDataPathsAndValues();
/// Iterate over all rows and extract types from dynamic values in shared data.
for (size_t i = 0; i != shared_data_offsets.size(); ++i)
{
/// [start, end) is the range of shared data entries of row i.
/// NOTE(review): index -1 is read for i == 0; presumably valid for this offsets container - confirm.
size_t start = shared_data_offsets[static_cast<ssize_t>(i) - 1];
size_t end = shared_data_offsets[static_cast<ssize_t>(i)];
for (size_t j = start; j != end; ++j)
{
/// Entries for which no type name is returned (decoded type is Nothing) are not reported.
if (auto type_name = getDynamicValueTypeFromSharedData(shared_data_values->getDataAt(j)))
{
paths_column->insertFrom(*shared_data_paths, j);
types_column->insertData(type_name->data(), type_name->size());
}
}
offsets.push_back(paths_column->size());
}
return ColumnMap::create(ColumnPtr(std::move(paths_column)), ColumnPtr(std::move(types_column)), ColumnPtr(std::move(offsets_column)));
}
/// Iterate over all rows and extract types from dynamic columns from dynamic paths and from values in shared data.
/// For typed paths the type name is known in advance. For dynamic paths the type string starts empty
/// and is overwritten for every row before it is inserted into the result.
std::vector<std::pair<std::string_view, String>> sorted_typed_and_dynamic_paths_with_types;
const auto & typed_path_types = type_object.getTypedPaths();
const auto & dynamic_path_columns = column_object.getDynamicPaths();
sorted_typed_and_dynamic_paths_with_types.reserve(typed_path_types.size() + dynamic_path_columns.size());
for (const auto & [path, type] : typed_path_types)
sorted_typed_and_dynamic_paths_with_types.emplace_back(path, type->getName());
for (const auto & [path, _] : dynamic_path_columns)
sorted_typed_and_dynamic_paths_with_types.emplace_back(path, "");
/// We want the resulting arrays of paths and values to be sorted for consistency.
std::sort(sorted_typed_and_dynamic_paths_with_types.begin(), sorted_typed_and_dynamic_paths_with_types.end());
const auto & shared_data_offsets = column_object.getSharedDataOffsets();
const auto [shared_data_paths, shared_data_values] = column_object.getSharedDataPathsAndValues();
for (size_t i = 0; i != shared_data_offsets.size(); ++i)
{
/// [start, end) is the range of shared data entries of row i.
/// NOTE(review): index -1 is read for i == 0; presumably valid for this offsets container - confirm.
size_t start = shared_data_offsets[static_cast<ssize_t>(i) - 1];
size_t end = shared_data_offsets[static_cast<ssize_t>(i)];
/// Merge sorted list of paths and values from shared data and sorted_typed_and_dynamic_paths_with_types
size_t sorted_paths_index = 0;
for (size_t j = start; j != end; ++j)
{
auto shared_data_path = shared_data_paths->getDataAt(j).toView();
auto type_name = getDynamicValueTypeFromSharedData(shared_data_values->getDataAt(j));
/// Skip NULL values.
if (!type_name)
continue;
/// First output all typed/dynamic paths that are ordered before the current shared data path.
while (sorted_paths_index != sorted_typed_and_dynamic_paths_with_types.size() && sorted_typed_and_dynamic_paths_with_types[sorted_paths_index].first < shared_data_path)
{
auto & [path, type] = sorted_typed_and_dynamic_paths_with_types[sorted_paths_index];
/// Update type for path from dynamic paths.
if (auto it = dynamic_path_columns.find(path); it != dynamic_path_columns.end())
{
/// Skip NULL values.
if (it->second->isNullAt(i))
{
++sorted_paths_index;
continue;
}
type = getDynamicValueType(it->second, i);
}
paths_column->insertData(path.data(), path.size());
types_column->insertData(type.data(), type.size());
++sorted_paths_index;
}
paths_column->insertData(shared_data_path.data(), shared_data_path.size());
types_column->insertData(type_name->data(), type_name->size());
}
/// Output the remaining typed/dynamic paths that are ordered after the last reported shared data path of this row.
for (; sorted_paths_index != sorted_typed_and_dynamic_paths_with_types.size(); ++sorted_paths_index)
{
auto & [path, type] = sorted_typed_and_dynamic_paths_with_types[sorted_paths_index];
if (auto it = dynamic_path_columns.find(path); it != dynamic_path_columns.end())
{
/// Skip NULL values.
if (it->second->isNullAt(i))
continue;
type = getDynamicValueType(it->second, i);
}
paths_column->insertData(path.data(), path.size());
types_column->insertData(type.data(), type.size());
}
offsets.push_back(paths_column->size());
}
return ColumnMap::create(ColumnPtr(std::move(paths_column)), ColumnPtr(std::move(types_column)), ColumnPtr(std::move(offsets_column)));
}
/// Returns the name of the type of the value stored in row i of the given dynamic column.
/// The caller must make sure that the value is not NULL.
/// NOTE(review): the result of checkAndGetColumn is dereferenced without a null check,
/// so the column is assumed to always be a ColumnDynamic - confirm.
String getDynamicValueType(const ColumnPtr & column, size_t i) const
{
const ColumnDynamic * dynamic_column = checkAndGetColumn<ColumnDynamic>(column.get());
const auto & variant_info = dynamic_column->getVariantInfo();
const auto & variant_column = dynamic_column->getVariantColumn();
auto global_discr = variant_column.globalDiscriminatorAt(i);
/// We don't output path with NULL values. It should be checked before calling getDynamicValueType.
chassert(global_discr != ColumnVariant::NULL_DISCRIMINATOR);
/// For a value stored in the shared variant the type is decoded from the beginning of the stored value.
if (global_discr == dynamic_column->getSharedVariantDiscriminator())
{
auto value = dynamic_column->getSharedVariant().getDataAt(variant_column.offsetAt(i));
ReadBufferFromMemory buf(value.data, value.size);
auto type = decodeDataType(buf);
return type->getName();
}
/// Otherwise the type name is taken from the variant names using the global discriminator as index.
return variant_info.variant_names[global_discr];
}
/// Decodes the data type from the beginning of a value stored in shared data and returns its name.
/// Returns std::nullopt if the decoded type is Nothing (callers treat such values as NULL and skip them).
std::optional<String> getDynamicValueTypeFromSharedData(StringRef value) const
{
ReadBufferFromMemory buf(value.data, value.size);
auto type = decodeDataType(buf);
if (isNothing(type))
return std::nullopt;
return type->getName();
}
};
}
/// Registers all six JSON path introspection functions together with their documentation:
/// JSONAllPaths, JSONAllPathsWithTypes, JSONDynamicPaths, JSONDynamicPathsWithTypes,
/// JSONSharedDataPaths and JSONSharedDataPathsWithTypes.
/// All examples use the same table with max_dynamic_paths=1, so only path 'a' is reported
/// as a dynamic path, while 'b' and 'c' are reported from the shared data.
REGISTER_FUNCTION(JSONPaths)
{
    factory.registerFunction<FunctionJSONPaths<JSONAllPathsImpl>>(FunctionDocumentation{
        .description = R"(
Returns the list of all paths stored in each row in JSON column.
)",
        .syntax = {"JSONAllPaths(json)"},
        .arguments = {{"json", "JSON column"}},
        .examples = {{{
            "Example",
            R"(
CREATE TABLE test (json JSON(max_dynamic_paths=1)) ENGINE = Memory;
INSERT INTO test FORMAT JSONEachRow {"json" : {"a" : 42}}, {"json" : {"b" : "Hello"}}, {"json" : {"a" : [1, 2, 3], "c" : "2020-01-01"}}
SELECT json, JSONAllPaths(json) FROM test;
)",
            R"(
jsonJSONAllPaths(json)
{"a":"42"} ['a']
{"b":"Hello"} ['b']
{"a":["1","2","3"],"c":"2020-01-01"} ['a','c']
)"}}},
        .categories{"JSON"},
    });

    factory.registerFunction<FunctionJSONPaths<JSONAllPathsWithTypesImpl>>(FunctionDocumentation{
        .description = R"(
Returns the list of all paths and their data types stored in each row in JSON column.
)",
        .syntax = {"JSONAllPathsWithTypes(json)"},
        .arguments = {{"json", "JSON column"}},
        .examples = {{{
            "Example",
            R"(
CREATE TABLE test (json JSON(max_dynamic_paths=1)) ENGINE = Memory;
INSERT INTO test FORMAT JSONEachRow {"json" : {"a" : 42}}, {"json" : {"b" : "Hello"}}, {"json" : {"a" : [1, 2, 3], "c" : "2020-01-01"}}
SELECT json, JSONAllPathsWithTypes(json) FROM test;
)",
            R"(
jsonJSONAllPathsWithTypes(json)
{"a":"42"} {'a':'Int64'}
{"b":"Hello"} {'b':'String'}
{"a":["1","2","3"],"c":"2020-01-01"} {'a':'Array(Nullable(Int64))','c':'Date'}
)"}}},
        .categories{"JSON"},
    });

    factory.registerFunction<FunctionJSONPaths<JSONDynamicPathsImpl>>(FunctionDocumentation{
        .description = R"(
Returns the list of dynamic paths that are stored as separate subcolumns in JSON column.
)",
        .syntax = {"JSONDynamicPaths(json)"},
        .arguments = {{"json", "JSON column"}},
        .examples = {{{
            "Example",
            R"(
CREATE TABLE test (json JSON(max_dynamic_paths=1)) ENGINE = Memory;
INSERT INTO test FORMAT JSONEachRow {"json" : {"a" : 42}}, {"json" : {"b" : "Hello"}}, {"json" : {"a" : [1, 2, 3], "c" : "2020-01-01"}}
SELECT json, JSONDynamicPaths(json) FROM test;
)",
            R"(
jsonJSONDynamicPaths(json)
{"a":"42"} ['a']
{"b":"Hello"} []
{"a":["1","2","3"],"c":"2020-01-01"} ['a']
)"}}},
        .categories{"JSON"},
    });

    factory.registerFunction<FunctionJSONPaths<JSONDynamicPathsWithTypesImpl>>(FunctionDocumentation{
        .description = R"(
Returns the list of dynamic paths that are stored as separate subcolumns and their types in each row in JSON column.
)",
        .syntax = {"JSONDynamicPathsWithTypes(json)"},
        .arguments = {{"json", "JSON column"}},
        .examples = {{{
            "Example",
            R"(
CREATE TABLE test (json JSON(max_dynamic_paths=1)) ENGINE = Memory;
INSERT INTO test FORMAT JSONEachRow {"json" : {"a" : 42}}, {"json" : {"b" : "Hello"}}, {"json" : {"a" : [1, 2, 3], "c" : "2020-01-01"}}
SELECT json, JSONDynamicPathsWithTypes(json) FROM test;
)",
            R"(
jsonJSONDynamicPathsWithTypes(json)
{"a":"42"} {'a':'Int64'}
{"b":"Hello"} {}
{"a":["1","2","3"],"c":"2020-01-01"} {'a':'Array(Nullable(Int64))'}
)"}}},
        .categories{"JSON"},
    });

    /// The syntax below must name JSONSharedDataPaths itself (it used to advertise JSONDynamicPaths).
    factory.registerFunction<FunctionJSONPaths<JSONSharedDataPathsImpl>>(FunctionDocumentation{
        .description = R"(
Returns the list of paths that are stored in shared data structure in JSON column.
)",
        .syntax = {"JSONSharedDataPaths(json)"},
        .arguments = {{"json", "JSON column"}},
        .examples = {{{
            "Example",
            R"(
CREATE TABLE test (json JSON(max_dynamic_paths=1)) ENGINE = Memory;
INSERT INTO test FORMAT JSONEachRow {"json" : {"a" : 42}}, {"json" : {"b" : "Hello"}}, {"json" : {"a" : [1, 2, 3], "c" : "2020-01-01"}}
SELECT json, JSONSharedDataPaths(json) FROM test;
)",
            R"(
jsonJSONSharedDataPaths(json)
{"a":"42"} []
{"b":"Hello"} ['b']
{"a":["1","2","3"],"c":"2020-01-01"} ['c']
)"}}},
        .categories{"JSON"},
    });

    /// The syntax, query and result below must describe JSONSharedDataPathsWithTypes itself
    /// (they used to be a verbatim copy of the JSONDynamicPathsWithTypes documentation).
    /// The result rows are consistent with the JSONSharedDataPaths and JSONAllPathsWithTypes examples above.
    factory.registerFunction<FunctionJSONPaths<JSONSharedDataPathsWithTypesImpl>>(FunctionDocumentation{
        .description = R"(
Returns the list of paths that are stored in shared data structure and their types in each row in JSON column.
)",
        .syntax = {"JSONSharedDataPathsWithTypes(json)"},
        .arguments = {{"json", "JSON column"}},
        .examples = {{{
            "Example",
            R"(
CREATE TABLE test (json JSON(max_dynamic_paths=1)) ENGINE = Memory;
INSERT INTO test FORMAT JSONEachRow {"json" : {"a" : 42}}, {"json" : {"b" : "Hello"}}, {"json" : {"a" : [1, 2, 3], "c" : "2020-01-01"}}
SELECT json, JSONSharedDataPathsWithTypes(json) FROM test;
)",
            R"(
jsonJSONSharedDataPathsWithTypes(json)
{"a":"42"} {}
{"b":"Hello"} {'b':'String'}
{"a":["1","2","3"],"c":"2020-01-01"} {'c':'Date'}
)"}}},
        .categories{"JSON"},
    });
}
}

View File

@ -2,10 +2,18 @@
#include <Functions/FunctionFactory.h> #include <Functions/FunctionFactory.h>
#include <Functions/FunctionStringOrArrayToT.h> #include <Functions/FunctionStringOrArrayToT.h>
#include <Functions/EmptyImpl.h> #include <Functions/EmptyImpl.h>
#include <Columns/ColumnObject.h>
namespace DB namespace DB
{ {
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
}
namespace namespace
{ {
@ -13,13 +21,135 @@ struct NameEmpty
{ {
static constexpr auto name = "empty"; static constexpr auto name = "empty";
}; };
using FunctionEmpty = FunctionStringOrArrayToT<EmptyImpl<false>, NameEmpty, UInt8, false>; using FunctionEmpty = FunctionStringOrArrayToT<EmptyImpl<false>, NameEmpty, UInt8, false>;
/// Executable part of `empty` for JSON (Object) columns.
/// Produces a UInt8 column with 1 for rows that hold no paths and 0 otherwise.
class ExecutableFunctionJSONEmpty : public IExecutableFunction
{
public:
    std::string getName() const override { return NameEmpty::name; }

private:
    bool useDefaultImplementationForConstants() const override { return true; }

    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t) const override
    {
        const ColumnWithTypeAndName & argument = arguments[0];
        const auto * json_column = typeid_cast<const ColumnObject *>(argument.column.get());
        if (!json_column)
            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unexpected column type in function {}. Expected Object column, got {}", getName(), argument.column->getName());

        auto result = DataTypeUInt8().createColumn();
        auto & flags = typeid_cast<ColumnUInt8 &>(*result).getData();
        const size_t rows = json_column->size();

        /// Typed paths always carry a value, so a column that has any of them has no empty rows at all.
        if (!json_column->getTypedPaths().empty())
        {
            flags.resize_fill(rows, 0);
            return result;
        }

        const auto & dynamic_paths = json_column->getDynamicPaths();
        const auto & shared_data = json_column->getSharedDataPtr();

        flags.reserve(rows);
        for (size_t row = 0; row != rows; ++row)
        {
            /// A row can only be empty when shared data holds the default value for it ...
            bool is_empty = shared_data->isDefaultAt(row);
            if (is_empty)
            {
                /// ... and every dynamic path is NULL in that row.
                for (const auto & [path, column] : dynamic_paths)
                {
                    if (!column->isNullAt(row))
                    {
                        is_empty = false;
                        break;
                    }
                }
            }
            flags.push_back(is_empty);
        }

        return result;
    }
};
/// Function base for `empty` applied to a JSON argument.
/// It only stores the resolved argument/result types; execution is done by ExecutableFunctionJSONEmpty.
class FunctionEmptyJSON final : public IFunctionBase
{
public:
    FunctionEmptyJSON(const DataTypes & argument_types_, const DataTypePtr & return_type_)
        : argument_types(argument_types_)
        , return_type(return_type_)
    {
    }

    String getName() const override { return NameEmpty::name; }

    const DataTypes & getArgumentTypes() const override { return argument_types; }
    const DataTypePtr & getResultType() const override { return return_type; }

    bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; }

    /// The executable function keeps no state, so a fresh instance is created on every call.
    ExecutableFunctionPtr prepare(const ColumnsWithTypeAndName & /*sample_columns*/) const override
    {
        return std::make_unique<ExecutableFunctionJSONEmpty>();
    }

private:
    DataTypes argument_types;
    DataTypePtr return_type;
};
/// Overload resolver for `empty`: a single JSON (Object) argument is routed to FunctionEmptyJSON,
/// every other accepted argument type goes to the generic FunctionEmpty wrapped into an adaptor.
class FunctionEmptyOverloadResolver final : public IFunctionOverloadResolver
{
public:
    static constexpr auto name = NameEmpty::name;

    static FunctionOverloadResolverPtr create(ContextPtr)
    {
        return std::make_unique<FunctionEmptyOverloadResolver>();
    }

    String getName() const override { return NameEmpty::name; }

    size_t getNumberOfArguments() const override { return 1; }

    FunctionBasePtr buildImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & return_type) const override
    {
        /// Collect the argument types only; both implementations are constructed from them.
        DataTypes types;
        types.reserve(arguments.size());
        for (const auto & argument : arguments)
            types.push_back(argument.type);

        const bool is_json_argument = types.size() == 1 && isObject(types[0]);
        if (is_json_argument)
            return std::make_shared<FunctionEmptyJSON>(types, return_type);

        return std::make_shared<FunctionToFunctionBaseAdaptor>(std::make_shared<FunctionEmpty>(), types, return_type);
    }

    DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
    {
        const auto & argument = arguments[0];

        /// Accepted argument types, checked in the same order as the generic implementation list plus JSON.
        const bool is_supported = isStringOrFixedString(argument)
            || isArray(argument)
            || isMap(argument)
            || isUUID(argument)
            || isIPv6(argument)
            || isIPv4(argument)
            || isObject(argument);

        if (!is_supported)
            throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}", arguments[0]->getName(), getName());

        return std::make_shared<DataTypeUInt8>();
    }
};
} }
REGISTER_FUNCTION(Empty) REGISTER_FUNCTION(Empty)
{ {
factory.registerFunction<FunctionEmpty>(); factory.registerFunction<FunctionEmptyOverloadResolver>();
} }
} }

View File

@ -12,6 +12,7 @@
#include <IO/Operators.h> #include <IO/Operators.h>
#include <cstdlib> #include <cstdlib>
#include <bit> #include <bit>
#include <utility>
#include <base/simd.h> #include <base/simd.h>
@ -855,6 +856,12 @@ void readBackQuotedString(String & s, ReadBuffer & buf)
readBackQuotedStringInto<false>(s, buf); readBackQuotedStringInto<false>(s, buf);
} }
/// Clears `s` and reads a back-quoted string from `buf` into it.
/// The result is whatever readAnyQuotedStringInto reports for the bool return type
/// (presumably false on malformed input instead of an exception - confirm against its definition).
bool tryReadBackQuotedString(String & s, ReadBuffer & buf)
{
    s.clear();
    const bool parsed = readAnyQuotedStringInto<'`', false, String, bool>(s, buf);
    return parsed;
}
void readBackQuotedStringWithSQLStyle(String & s, ReadBuffer & buf) void readBackQuotedStringWithSQLStyle(String & s, ReadBuffer & buf)
{ {
s.clear(); s.clear();
@ -1270,6 +1277,81 @@ ReturnType readJSONArrayInto(Vector & s, ReadBuffer & buf)
template void readJSONArrayInto<PaddedPODArray<UInt8>, void>(PaddedPODArray<UInt8> & s, ReadBuffer & buf); template void readJSONArrayInto<PaddedPODArray<UInt8>, void>(PaddedPODArray<UInt8> & s, ReadBuffer & buf);
template bool readJSONArrayInto<PaddedPODArray<UInt8>, bool>(PaddedPODArray<UInt8> & s, ReadBuffer & buf); template bool readJSONArrayInto<PaddedPODArray<UInt8>, bool>(PaddedPODArray<UInt8> & s, ReadBuffer & buf);
/// Reads one JSON object starting at the current position of `buf` and returns it as a string_view.
/// The object boundary is found only by balancing '{' and '}' outside of double-quoted regions;
/// no other JSON validation is performed here.
/// While all consumed bytes are still inside the buffer chunk that was current on entry, the returned
/// view points directly into that chunk. Otherwise the bytes are accumulated in `object_buffer`
/// (which is always cleared first) and the view points to it.
/// Throws INCORRECT_DATA if the input does not start with '{' or ends before the object is closed.
/// NOTE(review): the returned view aliases either `buf` memory or `object_buffer`, so it presumably
/// stays valid only until one of them is modified - confirm with callers.
std::string_view readJSONObjectAsViewPossiblyInvalid(ReadBuffer & buf, String & object_buffer)
{
if (buf.eof() || *buf.position() != '{')
throw Exception(ErrorCodes::INCORRECT_DATA, "JSON object should start with '{{'");
/// Remember where the object begins inside the current chunk.
char * start = buf.position();
bool use_object_buffer = false;
object_buffer.clear();
++buf.position();
/// Number of currently unclosed '{'; the opening one is already consumed.
Int64 balance = 1;
/// True while the position is inside a double-quoted string.
bool quotes = false;
while (true)
{
/// The current chunk is exhausted: save everything consumed so far before buf.eof() is called
/// (presumably eof() loads the next chunk and `start` would no longer be usable - confirm against ReadBuffer).
if (!buf.hasPendingData() && !use_object_buffer)
{
use_object_buffer = true;
object_buffer.append(start, buf.position() - start);
}
if (buf.eof())
throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected EOF while reading JSON object");
/// Jump to the next character that can change the state: escape, brace or quote.
char * next_pos = find_first_symbols<'\\', '{', '}', '"'>(buf.position(), buf.buffer().end());
if (use_object_buffer)
object_buffer.append(buf.position(), next_pos - buf.position());
buf.position() = next_pos;
/// No special character in the rest of this chunk: go to the next loop iteration to continue with more data.
if (!buf.hasPendingData())
continue;
if (use_object_buffer)
object_buffer.push_back(*buf.position());
if (*buf.position() == '\\')
{
/// Skip the character that follows the backslash without interpreting it
/// (this is done regardless of whether we are inside quotes).
++buf.position();
if (!buf.hasPendingData() && !use_object_buffer)
{
use_object_buffer = true;
object_buffer.append(start, buf.position() - start);
}
if (buf.eof())
throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected EOF while reading JSON object");
if (use_object_buffer)
object_buffer.push_back(*buf.position());
++buf.position();
continue;
}
if (*buf.position() == '"')
quotes = !quotes;
else if (!quotes) // can be only opening_bracket or closing_bracket
balance += *buf.position() == '{' ? 1 : -1;
++buf.position();
/// The outermost object has just been closed.
if (balance == 0)
{
if (use_object_buffer)
return object_buffer;
return {start, buf.position()};
}
/// NOTE(review): balance changes by at most 1 per iteration and the function returns at 0,
/// so this branch (and the throw after the loop) looks unreachable - confirm.
if (balance < 0)
break;
}
throw Exception(ErrorCodes::INCORRECT_DATA, "JSON object should have equal number of opening and closing brackets");
}
template <typename ReturnType> template <typename ReturnType>
ReturnType readDateTextFallback(LocalDate & date, ReadBuffer & buf, const char * allowed_delimiters) ReturnType readDateTextFallback(LocalDate & date, ReadBuffer & buf, const char * allowed_delimiters)
{ {
@ -1918,6 +2000,11 @@ static ReturnType readParsedValueInto(Vector & s, ReadBuffer & buf, ParseFunc pa
return ReturnType(true); return ReturnType(true);
} }
/// Non-template entry point to readParsedValueInto for a String destination and a type-erased parse function.
/// The work is entirely delegated to readParsedValueInto<void>.
void readParsedValueIntoString(String & s, ReadBuffer & buf, std::function<void(ReadBuffer &)> parse_func)
{
    return readParsedValueInto<void>(s, buf, std::move(parse_func));
}
template <typename ReturnType = void, typename Vector> template <typename ReturnType = void, typename Vector>
static ReturnType readQuotedStringFieldInto(Vector & s, ReadBuffer & buf) static ReturnType readQuotedStringFieldInto(Vector & s, ReadBuffer & buf)
{ {

View File

@ -600,6 +600,7 @@ bool tryReadDoubleQuotedStringWithSQLStyle(String & s, ReadBuffer & buf);
void readJSONString(String & s, ReadBuffer & buf, const FormatSettings::JSON & settings); void readJSONString(String & s, ReadBuffer & buf, const FormatSettings::JSON & settings);
void readBackQuotedString(String & s, ReadBuffer & buf); void readBackQuotedString(String & s, ReadBuffer & buf);
/// Reads a back-quoted string into `s` and returns a bool status
/// (presumably false when the input is not a valid back-quoted string - confirm against the definition).
bool tryReadBackQuotedString(String & s, ReadBuffer & buf);
void readBackQuotedStringWithSQLStyle(String & s, ReadBuffer & buf); void readBackQuotedStringWithSQLStyle(String & s, ReadBuffer & buf);
void readStringUntilEOF(String & s, ReadBuffer & buf); void readStringUntilEOF(String & s, ReadBuffer & buf);
@ -687,6 +688,10 @@ ReturnType readJSONObjectPossiblyInvalid(Vector & s, ReadBuffer & buf);
template <typename Vector, typename ReturnType = void> template <typename Vector, typename ReturnType = void>
ReturnType readJSONArrayInto(Vector & s, ReadBuffer & buf); ReturnType readJSONArrayInto(Vector & s, ReadBuffer & buf);
/// Similar to readJSONObjectPossiblyInvalid, but avoids copying the data if the JSON object fits into the current read buffer.
/// If copying is unavoidable, it copies the data into the provided object_buffer and returns a string_view to it.
std::string_view readJSONObjectAsViewPossiblyInvalid(ReadBuffer & buf, String & object_buffer);
template <typename Vector> template <typename Vector>
void readStringUntilWhitespaceInto(Vector & s, ReadBuffer & buf); void readStringUntilWhitespaceInto(Vector & s, ReadBuffer & buf);
@ -1920,6 +1925,8 @@ struct PcgDeserializer
} }
}; };
/// Applies `parse_func` to `buf` and fills `s` via readParsedValueInto
/// (presumably with the raw bytes that `parse_func` consumed - confirm against readParsedValueInto).
void readParsedValueIntoString(String & s, ReadBuffer & buf, std::function<void(ReadBuffer &)> parse_func);
template <typename ReturnType = void, typename Vector> template <typename ReturnType = void, typename Vector>
ReturnType readQuotedFieldInto(Vector & s, ReadBuffer & buf); ReturnType readQuotedFieldInto(Vector & s, ReadBuffer & buf);

View File

@ -593,8 +593,8 @@ ReturnType parseDateTimeBestEffortImpl(
else else
return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: unexpected word"); return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: unexpected word");
while (!in.eof() && isAlphaASCII(*in.position())) // while (!in.eof() && isAlphaASCII(*in.position()))
++in.position(); // ++in.position();
/// For RFC 2822 /// For RFC 2822
if (has_day_of_week) if (has_day_of_week)

View File

@ -68,6 +68,7 @@ WITH map(
'Map', 'JSON', 'Map', 'JSON',
'Tuple', 'JSON', 'Tuple', 'JSON',
'Object', 'JSON', 'Object', 'JSON',
'JSON', 'JSON',
'String', '{}', 'String', '{}',
'FixedString', '{}') AS native_to_mysql_mapping, 'FixedString', '{}') AS native_to_mysql_mapping,
)", )",

View File

@ -500,6 +500,12 @@ static void validateUpdateColumns(
throw Exception(ErrorCodes::NO_SUCH_COLUMN_IN_TABLE, "There is no column {} in table", backQuote(column_name)); throw Exception(ErrorCodes::NO_SUCH_COLUMN_IN_TABLE, "There is no column {} in table", backQuote(column_name));
} }
} }
else if (storage_columns.getColumn(GetColumnsOptions::Ordinary, column_name).type->hasDynamicSubcolumns())
{
throw Exception(ErrorCodes::CANNOT_UPDATE_COLUMN,
"Cannot update column {} with type {}: updates of columns with dynamic subcolumns are not supported",
backQuote(column_name), storage_columns.getColumn(GetColumnsOptions::Ordinary, column_name).type->getName());
}
} }
} }

View File

@ -47,10 +47,10 @@
#include <Parsers/queryToString.h> #include <Parsers/queryToString.h>
#include <Parsers/ASTCreateQuery.h> #include <Parsers/ASTCreateQuery.h>
#include <DataTypes/NestedUtils.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeObject.h>
#include <DataTypes/DataTypeLowCardinality.h> #include <DataTypes/DataTypeLowCardinality.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeObjectDeprecated.h>
#include <DataTypes/NestedUtils.h>
#include <IO/WriteHelpers.h> #include <IO/WriteHelpers.h>
#include <Storages/IStorage.h> #include <Storages/IStorage.h>
@ -1173,9 +1173,9 @@ bool TreeRewriterResult::collectUsedColumns(const ASTPtr & query, bool is_select
if (object_pos != std::string::npos) if (object_pos != std::string::npos)
{ {
String object_name = it->substr(0, object_pos); String object_name = it->substr(0, object_pos);
if (pair.name == object_name && pair.type->getTypeId() == TypeIndex::Object) if (pair.name == object_name && pair.type->getTypeId() == TypeIndex::ObjectDeprecated)
{ {
const auto * object_type = typeid_cast<const DataTypeObject *>(pair.type.get()); const auto * object_type = typeid_cast<const DataTypeObjectDeprecated *>(pair.type.get());
if (object_type->getSchemaFormat() == "json" && object_type->hasNullableSubcolumns()) if (object_type->getSchemaFormat() == "json" && object_type->hasNullableSubcolumns())
{ {
missed_subcolumns.insert(*it); missed_subcolumns.insert(*it);

View File

@ -463,7 +463,7 @@ Field convertFieldToTypeImpl(const Field & src, const IDataType & type, const ID
return src; return src;
} }
else if (isObject(type)) else if (isObjectDeprecated(type))
{ {
if (src.getType() == Field::Types::Object) if (src.getType() == Field::Types::Object)
return src; /// Already in needed type. return src; /// Already in needed type.
@ -523,6 +523,13 @@ Field convertFieldToTypeImpl(const Field & src, const IDataType & type, const ID
/// We can insert any field to Dynamic column. /// We can insert any field to Dynamic column.
return src; return src;
} }
else if (isObject(type))
{
if (src.getType() == Field::Types::Object)
return src; /// Already in needed type.
/// TODO: add conversion from Map/Tuple to Object.
}
/// Conversion from string by parsing. /// Conversion from string by parsing.
if (src.getType() == Field::Types::String) if (src.getType() == Field::Types::String)

View File

@ -3,6 +3,7 @@
#include <DataTypes/DataTypeLowCardinality.h> #include <DataTypes/DataTypeLowCardinality.h>
#include <DataTypes/DataTypeNullable.h> #include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeVariant.h> #include <DataTypes/DataTypeVariant.h>
#include <DataTypes/DataTypeObject.h>
#include <DataTypes/getLeastSupertype.h> #include <DataTypes/getLeastSupertype.h>
#include <Interpreters/Context.h> #include <Interpreters/Context.h>
#include <Interpreters/InterpreterCreateQuery.h> #include <Interpreters/InterpreterCreateQuery.h>
@ -30,6 +31,7 @@ DataTypeValidationSettings::DataTypeValidationSettings(const DB::Settings& setti
, allow_suspicious_variant_types(settings.allow_suspicious_variant_types) , allow_suspicious_variant_types(settings.allow_suspicious_variant_types)
, validate_nested_types(settings.validate_experimental_and_suspicious_types_inside_nested_types) , validate_nested_types(settings.validate_experimental_and_suspicious_types_inside_nested_types)
, allow_experimental_dynamic_type(settings.allow_experimental_dynamic_type) , allow_experimental_dynamic_type(settings.allow_experimental_dynamic_type)
, allow_experimental_json_type(settings.allow_experimental_json_type)
{ {
} }
@ -123,7 +125,7 @@ void validateDataType(const DataTypePtr & type_to_check, const DataTypeValidatio
if (!settings.allow_experimental_dynamic_type) if (!settings.allow_experimental_dynamic_type)
{ {
if (data_type.hasDynamicSubcolumns()) if (isDynamic(data_type))
{ {
throw Exception( throw Exception(
ErrorCodes::ILLEGAL_COLUMN, ErrorCodes::ILLEGAL_COLUMN,
@ -132,6 +134,19 @@ void validateDataType(const DataTypePtr & type_to_check, const DataTypeValidatio
data_type.getName()); data_type.getName());
} }
} }
if (!settings.allow_experimental_json_type)
{
const auto * object_type = typeid_cast<const DataTypeObject *>(&data_type);
if (object_type && object_type->getSchemaFormat() == DataTypeObject::SchemaFormat::JSON)
{
throw Exception(
ErrorCodes::ILLEGAL_COLUMN,
"Cannot create column with type '{}' because experimental JSON type is not allowed. "
"Set setting allow_experimental_json_type = 1 in order to allow it",
data_type.getName());
}
}
}; };
validate_callback(*type_to_check); validate_callback(*type_to_check);

View File

@ -23,6 +23,7 @@ struct DataTypeValidationSettings
bool allow_suspicious_variant_types = true; bool allow_suspicious_variant_types = true;
bool validate_nested_types = true; bool validate_nested_types = true;
bool allow_experimental_dynamic_type = true; bool allow_experimental_dynamic_type = true;
bool allow_experimental_json_type = true;
}; };
void validateDataType(const DataTypePtr & type, const DataTypeValidationSettings & settings); void validateDataType(const DataTypePtr & type, const DataTypeValidationSettings & settings);

Some files were not shown because too many files have changed in this diff Show More