Merge remote-tracking branch 'origin/master' into musysroot

This commit is contained in:
Michael Kolupaev 2024-08-17 03:28:31 +00:00
commit 52c5619a92
448 changed files with 18929 additions and 2967 deletions

View File

@ -9,4 +9,14 @@ target_include_directories(_usearch SYSTEM INTERFACE
${SIMSIMD_PROJECT_DIR}/include
${USEARCH_PROJECT_DIR}/include)
target_compile_definitions(_usearch INTERFACE USEARCH_USE_FP16LIB)
# target_compile_definitions(_usearch INTERFACE USEARCH_USE_SIMSIMD)
# ^^ simsimd is not enabled at the moment. Reasons:
# - Vectorization is important for raw scans but not so much for HNSW. We use usearch only for HNSW.
# - Simsimd does compile-time dispatch (choice of SIMD kernels determined by capabilities of the build machine) or dynamic dispatch (SIMD
# kernels chosen at runtime based on cpuid instruction). Since current builds are limited to SSE 4.2 (x86) and NEON (ARM), the speedup of
# the former would be moderate compared to AVX-512 / SVE. The latter is at the moment too fragile with respect to portability across x86
#   and ARM machines ... certain combinations of quantizations / distance functions / SIMD instructions are not implemented at the moment.
add_library(ch_contrib::usearch ALIAS _usearch)

View File

@ -129,6 +129,7 @@ configure
# Check that all new/changed setting were added in settings changes history.
# Some settings can be different for builds with sanitizers, so we check
# Also the automatic value of 'max_threads' and similar was displayed as "'auto(...)'" in previous versions instead of "auto(...)".
# settings changes only for non-sanitizer builds.
IS_SANITIZED=$(clickhouse-local --query "SELECT value LIKE '%-fsanitize=%' FROM system.build_options WHERE name = 'CXX_FLAGS'")
if [ "${IS_SANITIZED}" -eq "0" ]
@ -145,7 +146,9 @@ then
old_settings.value AS old_value
FROM new_settings
LEFT JOIN old_settings ON new_settings.name = old_settings.name
WHERE (new_settings.value != old_settings.value) AND (name NOT IN (
WHERE (new_value != old_value)
AND NOT (startsWith(new_value, 'auto(') AND old_value LIKE '%auto(%')
AND (name NOT IN (
SELECT arrayJoin(tupleElement(changes, 'name'))
FROM
(
@ -177,7 +180,7 @@ then
if [ -s changed_settings.txt ]
then
mv changed_settings.txt /test_output/
echo -e "Changed settings are not reflected in settings changes history (see changed_settings.txt)$FAIL$(head_escaped /test_output/changed_settings.txt)" >> /test_output/test_results.tsv
echo -e "Changed settings are not reflected in the settings changes history (see changed_settings.txt)$FAIL$(head_escaped /test_output/changed_settings.txt)" >> /test_output/test_results.tsv
else
echo -e "There are no changed settings or they are reflected in settings changes history$OK" >> /test_output/test_results.tsv
fi

View File

@ -22,10 +22,10 @@ ORDER BY Distance(vectors, Point)
LIMIT N
```
`vectors` contains N-dimensional values of type [Array(Float32)](../../../sql-reference/data-types/array.md), for example embeddings.
Function `Distance` computes the distance between two vectors. Often, the Euclidean (L2) distance is chosen as distance function but [other
distance functions](/docs/en/sql-reference/functions/distance-functions.md) are also possible. `Point` is the reference point, e.g. `(0.17,
0.33, ...)`, and `N` limits the number of search results.
`vectors` contains N-dimensional values of type [Array(Float32)](../../../sql-reference/data-types/array.md) or Array(Float64), for example
embeddings. Function `Distance` computes the distance between two vectors. Often, the Euclidean (L2) distance is chosen as distance function
but [other distance functions](/docs/en/sql-reference/functions/distance-functions.md) are also possible. `Point` is the reference point,
e.g. `(0.17, 0.33, ...)`, and `N` limits the number of search results.
This query returns the top-`N` closest points to the reference point. Parameter `N` limits the number of returned values which is useful for
situations where `MaxDistance` is difficult to determine in advance.

View File

@ -10,7 +10,7 @@ sidebar_label: Visual Interfaces
### ch-ui {#ch-ui}
[ch-ui](https://github.com/caioricciuti/ch-ui) is a simple React.js app interface for ClickHouse databases, designed for executing queries and visualizing data. Built with React and the ClickHouse client for web, it offers a sleek and user-friendly UI for easy database interactions.
[ch-ui](https://github.com/caioricciuti/ch-ui) is a simple React.js app interface for ClickHouse databases designed for executing queries and visualizing data. Built with React and the ClickHouse client for web, it offers a sleek and user-friendly UI for easy database interactions.
Features:
@ -25,7 +25,7 @@ Web interface for ClickHouse in the [Tabix](https://github.com/tabixio/tabix) pr
Features:
- Works with ClickHouse directly from the browser, without the need to install additional software.
- Works with ClickHouse directly from the browser without the need to install additional software.
- Query editor with syntax highlighting.
- Auto-completion of commands.
- Tools for graphical analysis of query execution.
@ -63,7 +63,7 @@ Features:
- Table list with filtering and metadata.
- Table preview with filtering and sorting.
- Read-only queries execution.
- Read-only query execution.
### Redash {#redash}
@ -75,23 +75,23 @@ Features:
- Powerful editor of queries.
- Database explorer.
- Visualization tools, that allow you to represent data in different forms.
- Visualization tool that allows you to represent data in different forms.
### Grafana {#grafana}
[Grafana](https://grafana.com/grafana/plugins/grafana-clickhouse-datasource/) is a platform for monitoring and visualization.
"Grafana allows you to query, visualize, alert on and understand your metrics no matter where they are stored. Create, explore, and share dashboards with your team and foster a data driven culture. Trusted and loved by the community" — grafana.com.
"Grafana allows you to query, visualize, alert on and understand your metrics no matter where they are stored. Create, explore, and share dashboards with your team and foster a data-driven culture. Trusted and loved by the community" — grafana.com.
ClickHouse datasource plugin provides a support for ClickHouse as a backend database.
ClickHouse data source plugin provides support for ClickHouse as a backend database.
### qryn (#qryn)
### qryn {#qryn}
[qryn](https://metrico.in) is a polyglot, high-performance observability stack for ClickHouse _(formerly cLoki)_ with native Grafana integrations allowing users to ingest and analyze logs, metrics and telemetry traces from any agent supporting Loki/LogQL, Prometheus/PromQL, OTLP/Tempo, Elastic, InfluxDB and many more.
Features:
- Built in Explore UI and LogQL CLI for querying, extracting and visualizing data
- Built-in Explore UI and LogQL CLI for querying, extracting and visualizing data
- Native Grafana APIs support for querying, processing, ingesting, tracing and alerting without plugins
- Powerful pipeline to dynamically search, filter and extract data from logs, events, traces and beyond
- Ingestion and PUSH APIs transparently compatible with LogQL, PromQL, InfluxDB, Elastic and many more
@ -139,7 +139,7 @@ Features:
### DBM {#dbm}
[DBM](https://dbm.incubator.edurt.io/) DBM is a visual management tool for ClickHouse!
[DBM](https://github.com/devlive-community/dbm) DBM is a visual management tool for ClickHouse!
Features:
@ -151,7 +151,7 @@ Features:
- Support custom query
- Support multiple data sources management(connection test, monitoring)
- Support monitor (processor, connection, query)
- Support migrate data
- Support migrating data
### Bytebase {#bytebase}
@ -169,7 +169,7 @@ Features:
### Zeppelin-Interpreter-for-ClickHouse {#zeppelin-interpreter-for-clickhouse}
[Zeppelin-Interpreter-for-ClickHouse](https://github.com/SiderZhang/Zeppelin-Interpreter-for-ClickHouse) is a [Zeppelin](https://zeppelin.apache.org) interpreter for ClickHouse. Compared with JDBC interpreter, it can provide better timeout control for long running queries.
[Zeppelin-Interpreter-for-ClickHouse](https://github.com/SiderZhang/Zeppelin-Interpreter-for-ClickHouse) is a [Zeppelin](https://zeppelin.apache.org) interpreter for ClickHouse. Compared with the JDBC interpreter, it can provide better timeout control for long-running queries.
### ClickCat {#clickcat}
@ -179,7 +179,7 @@ Features:
- An online SQL editor which can run your SQL code without any installing.
- You can observe all processes and mutations. For those unfinished processes, you can kill them in ui.
- The Metrics contains Cluster Analysis,Data Analysis,Query Analysis.
- The Metrics contain Cluster Analysis, Data Analysis, and Query Analysis.
### ClickVisual {#clickvisual}
@ -332,7 +332,7 @@ Learn more about the product at [TABLUM.IO](https://tablum.io/)
### CKMAN {#ckman}
[CKMAN] (https://www.github.com/housepower/ckman) is a tool for managing and monitoring ClickHouse clusters!
[CKMAN](https://www.github.com/housepower/ckman) is a tool for managing and monitoring ClickHouse clusters!
Features:

View File

@ -307,8 +307,22 @@ SELECT dictGet('dict', 'B', 2);
## Named collections for accessing PostgreSQL database
The description of parameters see [postgresql](../sql-reference/table-functions/postgresql.md).
For the description of parameters see [postgresql](../sql-reference/table-functions/postgresql.md). Additionally, there are aliases:
- `username` for `user`
- `db` for `database`.
Parameter `addresses_expr` is used in a collection instead of `host:port`. The parameter is optional, because there are other optional ones: `host`, `hostname`, `port`. The following pseudo code explains the priority:
```sql
CASE
WHEN collection['addresses_expr'] != '' THEN collection['addresses_expr']
WHEN collection['host'] != '' THEN collection['host'] || ':' || if(collection['port'] != '', collection['port'], '5432')
WHEN collection['hostname'] != '' THEN collection['hostname'] || ':' || if(collection['port'] != '', collection['port'], '5432')
END
```
Example of creation:
```sql
CREATE NAMED COLLECTION mypg AS
user = 'pguser',
@ -316,7 +330,7 @@ password = 'jw8s0F4',
host = '127.0.0.1',
port = 5432,
database = 'test',
schema = 'test_schema',
schema = 'test_schema'
```
Example of configuration:
@ -369,6 +383,10 @@ SELECT * FROM mypgtable;
└───┘
```
:::note
PostgreSQL copies data from the named collection when the table is being created. A change in the collection does not affect the existing tables.
:::
### Example of using named collections with database with engine PostgreSQL
```sql

View File

@ -5620,6 +5620,19 @@ Minimal size of block to compress in CROSS JOIN. Zero value means - disable this
Default value: `1GiB`.
## use_json_alias_for_old_object_type
When enabled, `JSON` data type alias will be used to create an old [Object('json')](../../sql-reference/data-types/json.md) type instead of the new [JSON](../../sql-reference/data-types/newjson.md) type.
This setting requires server restart to take effect when changed.
Default value: `false`.
## type_json_skip_duplicated_paths
When enabled, ClickHouse will skip duplicated paths during parsing of [JSON](../../sql-reference/data-types/newjson.md) object. Only the value of the first occurrence of each path will be inserted.
Default value: `false`.
## restore_replace_external_engines_to_null
For testing purposes. Replaces all external engines to Null to not initiate external connections.

View File

@ -17,7 +17,8 @@ Columns:
- `duration_ms` ([UInt64](../../sql-reference/data-types/int-uint.md)) — How long the last refresh attempt took.
- `next_refresh_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Time at which the next refresh is scheduled to start.
- `remaining_dependencies` ([Array(String)](../../sql-reference/data-types/array.md)) — If the view has [refresh dependencies](../../sql-reference/statements/create/view.md#refresh-dependencies), this array contains the subset of those dependencies that are not satisfied for the current refresh yet. If `status = 'WaitingForDependencies'`, a refresh is ready to start as soon as these dependencies are fulfilled.
- `exception` ([String](../../sql-reference/data-types/string.md)) — if `last_refresh_result = 'Exception'`, i.e. the last refresh attempt failed, this column contains the corresponding error message and stack trace.
- `exception` ([String](../../sql-reference/data-types/string.md)) — if `last_refresh_result = 'Error'`, i.e. the last refresh attempt failed, this column contains the corresponding error message and stack trace.
- `retry` ([UInt64](../../sql-reference/data-types/int-uint.md)) — If nonzero, the current or next refresh is a retry (see `refresh_retries` refresh setting), and `retry` is the 1-based index of that retry.
- `refresh_count` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Number of successful refreshes since last server restart or table creation.
- `progress` ([Float64](../../sql-reference/data-types/float.md)) — Progress of the current refresh, between 0 and 1.
- `read_rows` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Number of rows read by the current refresh so far.

View File

@ -13,7 +13,7 @@ The table below describes how each data type is represented in binary format. Ea
`var_uint` in the binary encoding means that the size is encoded using Variable-Length Quantity compression.
| ClickHouse data type | Binary encoding |
|--------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|-----------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `Nothing` | `0x00` |
| `UInt8` | `0x01` |
| `UInt16` | `0x02` |
@ -62,7 +62,9 @@ The table below describes how each data type is represented in binary format. Ea
| `Bool` | `0x2D` |
| `SimpleAggregateFunction(function_name(param_1, ..., param_N), arg_T1, ..., arg_TN)` | `0x2E<var_uint_function_name_size><function_name_data><var_uint_number_of_parameters><param_1>...<param_N><var_uint_number_of_arguments><argument_type_encoding_1>...<argument_type_encoding_N>` (see [aggregate function parameter binary encoding](#aggregate-function-parameter-binary-encoding)) |
| `Nested(name1 T1, ..., nameN TN)` | `0x2F<var_uint_number_of_elements><var_uint_name_size_1><name_data_1><nested_type_encoding_1>...<var_uint_name_size_N><name_data_N><nested_type_encoding_N>` |
| `JSON(max_dynamic_paths=N, max_dynamic_types=M, path Type, SKIP skip_path, SKIP REGEXP skip_path_regexp)` | `0x30<uint8_serialization_version><var_int_max_dynamic_paths><uint8_max_dynamic_types><var_uint_number_of_typed_paths><var_uint_path_name_size_1><path_name_data_1><encoded_type_1>...<var_uint_number_of_skip_paths><var_uint_skip_path_size_1><skip_path_data_1>...<var_uint_number_of_skip_path_regexps><var_uint_skip_path_regexp_size_1><skip_path_data_regexp_1>...` |
For type `JSON`, the byte `uint8_serialization_version` indicates the version of the serialization. Right now the version is always 0 but can change in the future if new arguments are introduced for the `JSON` type.
### Interval kind binary encoding

View File

@ -19,7 +19,8 @@ ClickHouse data types include:
- **Boolean**: ClickHouse has a [`Boolean` type](./boolean.md)
- **Strings**: [`String`](./string.md) and [`FixedString`](./fixedstring.md)
- **Dates**: use [`Date`](./date.md) and [`Date32`](./date32.md) for days, and [`DateTime`](./datetime.md) and [`DateTime64`](./datetime64.md) for instances in time
- **JSON**: the [`JSON` object](./json.md) stores a JSON document in a single column
- **Object**: the [`Object`](./json.md) stores a JSON document in a single column (deprecated)
- **JSON**: the [`JSON` object](./newjson.md) stores a JSON document in a single column
- **UUID**: a performant option for storing [`UUID` values](./uuid.md)
- **Low cardinality types**: use an [`Enum`](./enum.md) when you have a handful of unique values, or use [`LowCardinality`](./lowcardinality.md) when you have up to 10,000 unique values of a column
- **Arrays**: any column can be defined as an [`Array` of values](./array.md)

View File

@ -13,7 +13,7 @@ keywords: [object, data type]
Stores JavaScript Object Notation (JSON) documents in a single column.
`JSON` is an alias for `Object('json')`.
`JSON` can be used as an alias to `Object('json')` when setting [use_json_alias_for_old_object_type](../../operations/settings/settings.md#usejsonaliasforoldobjecttype) is enabled.
## Example
@ -79,5 +79,5 @@ SELECT * FROM json FORMAT JSONEachRow
## Related Content
- [Using JSON in ClickHouse](/docs/en/integrations/data-formats/json)
- [Using JSON in ClickHouse](/en/integrations/data-formats/json/overview)
- [Getting Data Into ClickHouse - Part 2 - A JSON detour](https://clickhouse.com/blog/getting-data-into-clickhouse-part-2-json)

View File

@ -0,0 +1,516 @@
---
slug: /en/sql-reference/data-types/newjson
sidebar_position: 63
sidebar_label: JSON
keywords: [json, data type]
---
# JSON
Stores JavaScript Object Notation (JSON) documents in a single column.
:::note
This feature is experimental and is not production-ready. If you need to work with JSON documents, consider using [this guide](/docs/en/integrations/data-formats/json/overview) instead.
If you want to use JSON type, set `allow_experimental_json_type = 1`.
:::
To declare a column of `JSON` type, use the following syntax:
``` sql
<column_name> JSON(max_dynamic_paths=N, max_dynamic_types=M, some.path TypeName, SKIP path.to.skip, SKIP REGEXP 'paths_regexp')
```
Where:
- `max_dynamic_paths` is an optional parameter indicating how many paths can be stored separately as subcolumns across a single block of data that is stored separately (for example, across a single data part for a MergeTree table). If this limit is exceeded, all other paths will be stored together in a single structure. The default value of `max_dynamic_paths` is `1024`.
- `max_dynamic_types` is an optional parameter between `1` and `255` indicating how many different data types can be stored inside a single path column with type `Dynamic` across a single block of data that is stored separately (for example, across a single data part for a MergeTree table). If this limit is exceeded, all new types will be converted to type `String`. The default value of `max_dynamic_types` is `32`.
- `some.path TypeName` is an optional type hint for a particular path in the JSON. Such paths will always be stored as subcolumns with the specified type.
- `SKIP path.to.skip` is an optional hint for a particular path that should be skipped during JSON parsing. Such paths will never be stored in the JSON column. If the specified path is a nested JSON object, the whole nested object will be skipped.
- `SKIP REGEXP 'path_regexp'` is an optional hint with a regular expression that is used to skip paths during JSON parsing. All paths that match this regular expression will never be stored in the JSON column.
## Creating JSON
Using `JSON` type in table column definition:
```sql
CREATE TABLE test (json JSON) ENGINE = Memory;
INSERT INTO test VALUES ('{"a" : {"b" : 42}, "c" : [1, 2, 3]}'), ('{"f" : "Hello, World!"}'), ('{"a" : {"b" : 43, "e" : 10}, "c" : [4, 5, 6]}');
SELECT json FROM test;
```
```text
┌─json────────────────────────────────────────┐
│ {"a":{"b":"42"},"c":["1","2","3"]} │
│ {"f":"Hello, World!"} │
│ {"a":{"b":"43","e":"10"},"c":["4","5","6"]} │
└─────────────────────────────────────────────┘
```
```sql
CREATE TABLE test (json JSON(a.b UInt32, SKIP a.e)) ENGINE = Memory;
INSERT INTO test VALUES ('{"a" : {"b" : 42}, "c" : [1, 2, 3]}'), ('{"f" : "Hello, World!"}'), ('{"a" : {"b" : 43, "e" : 10}, "c" : [4, 5, 6]}');
SELECT json FROM test;
```
```text
┌─json──────────────────────────────┐
│ {"a":{"b":42},"c":[1,2,3]} │
│ {"a":{"b":0},"f":"Hello, World!"} │
│ {"a":{"b":43},"c":[4,5,6]} │
└───────────────────────────────────┘
```
Using CAST from 'String':
```sql
SELECT '{"a" : {"b" : 42},"c" : [1, 2, 3], "d" : "Hello, World!"}'::JSON as json;
```
```text
┌─json───────────────────────────────────────────┐
│ {"a":{"b":42},"c":[1,2,3],"d":"Hello, World!"} │
└────────────────────────────────────────────────┘
```
CAST from named `Tuple`, `Map` and `Object('json')` to `JSON` type will be supported later.
## Reading JSON paths as subcolumns
JSON type supports reading every path as a separate subcolumn. If the type of the requested path was not specified in the JSON type declaration, the subcolumn of the path will always have type [Dynamic](/docs/en/sql-reference/data-types/dynamic.md).
For example:
```sql
CREATE TABLE test (json JSON(a.b UInt32, SKIP a.e)) ENGINE = Memory;
INSERT INTO test VALUES ('{"a" : {"b" : 42, "g" : 42.42}, "c" : [1, 2, 3], "d" : "2020-01-01"}'), ('{"f" : "Hello, World!", "d" : "2020-01-02"}'), ('{"a" : {"b" : 43, "e" : 10, "g" : 43.43}, "c" : [4, 5, 6]}');
SELECT json FROM test;
```
```text
┌─json──────────────────────────────────────────────────┐
│ {"a":{"b":42,"g":42.42},"c":[1,2,3],"d":"2020-01-01"} │
│ {"a":{"b":0},"d":"2020-01-02","f":"Hello, World!"} │
│ {"a":{"b":43,"g":43.43},"c":[4,5,6]} │
└───────────────────────────────────────────────────────┘
```
```sql
SELECT json.a.b, json.a.g, json.c, json.d FROM test;
```
```text
┌─json.a.b─┬─json.a.g─┬─json.c──┬─json.d─────┐
│ 42 │ 42.42 │ [1,2,3] │ 2020-01-01 │
│ 0 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 2020-01-02 │
│ 43 │ 43.43 │ [4,5,6] │ ᴺᵁᴸᴸ │
└──────────┴──────────┴─────────┴────────────┘
```
If the requested path wasn't found in the data, it will be filled with `NULL` values:
```sql
SELECT json.non.existing.path FROM test;
```
```text
┌─json.non.existing.path─┐
│ ᴺᵁᴸᴸ │
│ ᴺᵁᴸᴸ │
│ ᴺᵁᴸᴸ │
└────────────────────────┘
```
Let's check the data types of returned subcolumns:
```sql
SELECT toTypeName(json.a.b), toTypeName(json.a.g), toTypeName(json.c), toTypeName(json.d) FROM test;
```
```text
┌─toTypeName(json.a.b)─┬─toTypeName(json.a.g)─┬─toTypeName(json.c)─┬─toTypeName(json.d)─┐
│ UInt32 │ Dynamic │ Dynamic │ Dynamic │
│ UInt32 │ Dynamic │ Dynamic │ Dynamic │
│ UInt32 │ Dynamic │ Dynamic │ Dynamic │
└──────────────────────┴──────────────────────┴────────────────────┴────────────────────┘
```
As we can see, for `a.b` the type is `UInt32` as we specified in the JSON type declaration, and for all other subcolumns the type is `Dynamic`.
It is also possible to read subcolumns of a `Dynamic` type using special syntax `json.some.path.:TypeName`:
```sql
select json.a.g.:Float64, dynamicType(json.a.g), json.d.:Date, dynamicType(json.d) FROM test;
```
```text
┌─json.a.g.:`Float64`─┬─dynamicType(json.a.g)─┬─json.d.:`Date`─┬─dynamicType(json.d)─┐
│ 42.42 │ Float64 │ 2020-01-01 │ Date │
│ ᴺᵁᴸᴸ │ None │ 2020-01-02 │ Date │
│ 43.43 │ Float64 │ ᴺᵁᴸᴸ │ None │
└─────────────────────┴───────────────────────┴────────────────┴─────────────────────┘
```
`Dynamic` subcolumns can be cast to any data type. In this case an exception will be thrown if the internal type inside `Dynamic` cannot be cast to the requested type:
```sql
select json.a.g::UInt64 as uint FROM test;
```
```text
┌─uint─┐
│ 42 │
│ 0 │
│ 43 │
└──────┘
```
```sql
select json.a.g::UUID as float FROM test;
```
```text
Received exception:
Code: 48. DB::Exception: Conversion between numeric types and UUID is not supported. Probably the passed UUID is unquoted: while executing 'FUNCTION CAST(__table1.json.a.g :: 2, 'UUID'_String :: 1) -> CAST(__table1.json.a.g, 'UUID'_String) UUID : 0'. (NOT_IMPLEMENTED)
```
## Reading JSON sub-objects as subcolumns
JSON type supports reading nested objects as subcolumns with type `JSON` using special syntax `json.^some.path`:
```sql
CREATE TABLE test (json JSON) ENGINE = Memory;
INSERT INTO test VALUES ('{"a" : {"b" : {"c" : 42, "g" : 42.42}}, "c" : [1, 2, 3], "d" : {"e" : {"f" : {"g" : "Hello, World", "h" : [1, 2, 3]}}}}'), ('{"f" : "Hello, World!", "d" : {"e" : {"f" : {"h" : [4, 5, 6]}}}}'), ('{"a" : {"b" : {"c" : 43, "e" : 10, "g" : 43.43}}, "c" : [4, 5, 6]}');
SELECT json FROM test;
```
```text
┌─json────────────────────────────────────────────────────────────────────────────────────────┐
│ {"a":{"b":{"c":42,"g":42.42}},"c":[1,2,3],"d":{"e":{"f":{"g":"Hello, World","h":[1,2,3]}}}} │
│ {"d":{"e":{"f":{"h":[4,5,6]}}},"f":"Hello, World!"} │
│ {"a":{"b":{"c":43,"e":10,"g":43.43}},"c":[4,5,6]} │
└─────────────────────────────────────────────────────────────────────────────────────────────┘
```
```sql
SELECT json.^a.b, json.^d.e.f FROM test;
```
```text
┌─json.^`a`.b───────────────┬─json.^`d`.e.f────────────────────┐
│ {"c":42,"g":42.42} │ {"g":"Hello, World","h":[1,2,3]} │
│ {} │ {"h":[4,5,6]} │
│ {"c":43,"e":10,"g":43.43} │ {} │
└───────────────────────────┴──────────────────────────────────┘
```
:::note
Reading sub-objects as subcolumns may be inefficient, as this may require an almost full scan of the JSON data.
:::
## Type inference for paths
During JSON parsing ClickHouse tries to detect the most appropriate data type for each JSON path. It works similarly to [automatic schema inference from input data](/docs/en/interfaces/schema-inference.md) and is controlled by the same settings:
- [input_format_try_infer_integers](/docs/en/interfaces/schema-inference.md#inputformattryinferintegers)
- [input_format_try_infer_dates](/docs/en/interfaces/schema-inference.md#inputformattryinferdates)
- [input_format_try_infer_datetimes](/docs/en/interfaces/schema-inference.md#inputformattryinferdatetimes)
- [schema_inference_make_columns_nullable](/docs/en/interfaces/schema-inference.md#schemainferencemakecolumnsnullable)
- [input_format_json_try_infer_numbers_from_strings](/docs/en/interfaces/schema-inference.md#inputformatjsontryinfernumbersfromstrings)
- [input_format_json_infer_incomplete_types_as_strings](/docs/en/interfaces/schema-inference.md#inputformatjsoninferincompletetypesasstrings)
- [input_format_json_read_numbers_as_strings](/docs/en/interfaces/schema-inference.md#inputformatjsonreadnumbersasstrings)
- [input_format_json_read_bools_as_strings](/docs/en/interfaces/schema-inference.md#inputformatjsonreadboolsasstrings)
- [input_format_json_read_bools_as_numbers](/docs/en/interfaces/schema-inference.md#inputformatjsonreadboolsasnumbers)
- [input_format_json_read_arrays_as_strings](/docs/en/interfaces/schema-inference.md#inputformatjsonreadarraysasstrings)
Let's see some examples:
```sql
SELECT JSONAllPathsWithTypes('{"a" : "2020-01-01", "b" : "2020-01-01 10:00:00"}'::JSON) AS paths_with_types settings input_format_try_infer_dates=1, input_format_try_infer_datetimes=1;
```
```text
┌─paths_with_types─────────────────┐
│ {'a':'Date','b':'DateTime64(9)'} │
└──────────────────────────────────┘
```
```sql
SELECT JSONAllPathsWithTypes('{"a" : "2020-01-01", "b" : "2020-01-01 10:00:00"}'::JSON) AS paths_with_types settings input_format_try_infer_dates=0, input_format_try_infer_datetimes=0;
```
```text
┌─paths_with_types────────────┐
│ {'a':'String','b':'String'} │
└─────────────────────────────┘
```
```sql
SELECT JSONAllPathsWithTypes('{"a" : [1, 2, 3]}'::JSON) AS paths_with_types settings schema_inference_make_columns_nullable=1;
```
```text
┌─paths_with_types───────────────┐
│ {'a':'Array(Nullable(Int64))'} │
└────────────────────────────────┘
```
```sql
SELECT JSONAllPathsWithTypes('{"a" : [1, 2, 3]}'::JSON) AS paths_with_types settings schema_inference_make_columns_nullable=0;
```
```text
┌─paths_with_types─────┐
│ {'a':'Array(Int64)'} │
└──────────────────────┘
```
## Handling arrays of JSON objects
JSON paths that contain an array of objects are parsed as type `Array(JSON)` and inserted into a `Dynamic` column for this path. To read an array of objects you can extract it from the `Dynamic` column as a subcolumn:
```sql
CREATE TABLE test (json JSON) ENGINE = Memory;
INSERT INTO test VALUES
('{"a" : {"b" : [{"c" : 42, "d" : "Hello", "f" : [[{"g" : 42.42}]], "k" : {"j" : 1000}}, {"c" : 43}, {"e" : [1, 2, 3], "d" : "My", "f" : [[{"g" : 43.43, "h" : "2020-01-01"}]], "k" : {"j" : 2000}}]}}'),
('{"a" : {"b" : [1, 2, 3]}}'),
('{"a" : {"b" : [{"c" : 44, "f" : [[{"h" : "2020-01-02"}]]}, {"e" : [4, 5, 6], "d" : "World", "f" : [[{"g" : 44.44}]], "k" : {"j" : 3000}}]}}');
SELECT json FROM test;
```
```text
┌─json────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
│ {"a":{"b":[{"c":"42","d":"Hello","f":[[{"g":42.42}]],"k":{"j":"1000"}},{"c":"43"},{"d":"My","e":["1","2","3"],"f":[[{"g":43.43,"h":"2020-01-01"}]],"k":{"j":"2000"}}]}} │
│ {"a":{"b":["1","2","3"]}} │
│ {"a":{"b":[{"c":"44","f":[[{"h":"2020-01-02"}]]},{"d":"World","e":["4","5","6"],"f":[[{"g":44.44}]],"k":{"j":"3000"}}]}} │
└─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
```
```sql
SELECT json.a.b, dynamicType(json.a.b) FROM test;
```
```text
┌─json.a.b──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬─dynamicType(json.a.b)────────────────────────────────────┐
│ ['{"c":"42","d":"Hello","f":[[{"g":42.42}]],"k":{"j":"1000"}}','{"c":"43"}','{"d":"My","e":["1","2","3"],"f":[[{"g":43.43,"h":"2020-01-01"}]],"k":{"j":"2000"}}'] │ Array(JSON(max_dynamic_types=16, max_dynamic_paths=256)) │
│ [1,2,3] │ Array(Nullable(Int64)) │
│ ['{"c":"44","f":[[{"h":"2020-01-02"}]]}','{"d":"World","e":["4","5","6"],"f":[[{"g":44.44}]],"k":{"j":"3000"}}'] │ Array(JSON(max_dynamic_types=16, max_dynamic_paths=256)) │
└───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┴──────────────────────────────────────────────────────────┘
```
As you can notice, the `max_dynamic_types/max_dynamic_paths` parameters of the nested `JSON` type were reduced compared to the default values. This is needed to avoid the number of subcolumns growing uncontrollably on nested arrays of JSON objects.
Let's try to read subcolumns from this nested `JSON` column:
```sql
SELECT json.a.b.:`Array(JSON)`.c, json.a.b.:`Array(JSON)`.f, json.a.b.:`Array(JSON)`.d FROM test;
```
```text
┌─json.a.b.:`Array(JSON)`.c─┬─json.a.b.:`Array(JSON)`.f───────────────────────────────────┬─json.a.b.:`Array(JSON)`.d─┐
│ [42,43,NULL] │ [[['{"g":42.42}']],NULL,[['{"g":43.43,"h":"2020-01-01"}']]] │ ['Hello',NULL,'My'] │
│ [] │ [] │ [] │
│ [44,NULL] │ [[['{"h":"2020-01-02"}']],[['{"g":44.44}']]] │ [NULL,'World'] │
└───────────────────────────┴─────────────────────────────────────────────────────────────┴───────────────────────────┘
```
We can avoid writing `Array(JSON)` subcolumn name using special syntax:
```sql
SELECT json.a.b[].c, json.a.b[].f, json.a.b[].d FROM test;
```
```text
┌─json.a.b.:`Array(JSON)`.c─┬─json.a.b.:`Array(JSON)`.f───────────────────────────────────┬─json.a.b.:`Array(JSON)`.d─┐
│ [42,43,NULL] │ [[['{"g":42.42}']],NULL,[['{"g":43.43,"h":"2020-01-01"}']]] │ ['Hello',NULL,'My'] │
│ [] │ [] │ [] │
│ [44,NULL] │ [[['{"h":"2020-01-02"}']],[['{"g":44.44}']]] │ [NULL,'World'] │
└───────────────────────────┴─────────────────────────────────────────────────────────────┴───────────────────────────┘
```
The number of `[]` after path indicates the array level. `json.path[][]` will be transformed to `json.path.:Array(Array(JSON))`
Let's check the paths and types inside our `Array(JSON)`:
```sql
SELECT DISTINCT arrayJoin(JSONAllPathsWithTypes(arrayJoin(json.a.b[]))) FROM test;
```
```text
┌─arrayJoin(JSONAllPathsWithTypes(arrayJoin(json.a.b.:`Array(JSON)`)))──┐
│ ('c','Int64') │
│ ('d','String') │
│ ('f','Array(Array(JSON(max_dynamic_types=8, max_dynamic_paths=64)))') │
│ ('k.j','Int64') │
│ ('e','Array(Nullable(Int64))') │
└───────────────────────────────────────────────────────────────────────┘
```
Let's read subcolumns from `Array(JSON)` column:
```sql
SELECT json.a.b[].c.:Int64, json.a.b[].f[][].g.:Float64, json.a.b[].f[][].h.:Date FROM test;
```
```text
┌─json.a.b.:`Array(JSON)`.c.:`Int64`─┬─json.a.b.:`Array(JSON)`.f.:`Array(Array(JSON))`.g.:`Float64`─┬─json.a.b.:`Array(JSON)`.f.:`Array(Array(JSON))`.h.:`Date`─┐
│ [42,43,NULL] │ [[[42.42]],[],[[43.43]]] │ [[[NULL]],[],[['2020-01-01']]] │
│ [] │ [] │ [] │
│ [44,NULL] │ [[[NULL]],[[44.44]]] │ [[['2020-01-02']],[[NULL]]] │
└────────────────────────────────────┴──────────────────────────────────────────────────────────────┴───────────────────────────────────────────────────────────┘
```
We can also read sub-object subcolumns from nested `JSON` column:
```sql
SELECT json.a.b[].^k FROM test
```
```text
┌─json.a.b.:`Array(JSON)`.^`k`─────────┐
│ ['{"j":"1000"}','{}','{"j":"2000"}'] │
│ [] │
│ ['{}','{"j":"3000"}'] │
└──────────────────────────────────────┘
```
## Reading JSON type from the data
All text formats (JSONEachRow, TSV, CSV, CustomSeparated, Values, etc) support reading the `JSON` type.
Examples:
```sql
SELECT json FROM format(JSONEachRow, 'json JSON(a.b.c UInt32, SKIP a.b.d, SKIP d.e, SKIP REGEXP \'b.*\')', '
{"json" : {"a" : {"b" : {"c" : 1, "d" : [0, 1]}}, "b" : "2020-01-01", "c" : 42, "d" : {"e" : {"f" : ["s1", "s2"]}, "i" : [1, 2, 3]}}}
{"json" : {"a" : {"b" : {"c" : 2, "d" : [2, 3]}}, "b" : [1, 2, 3], "c" : null, "d" : {"e" : {"g" : 43}, "i" : [4, 5, 6]}}}
{"json" : {"a" : {"b" : {"c" : 3, "d" : [4, 5]}}, "b" : {"c" : 10}, "e" : "Hello, World!"}}
{"json" : {"a" : {"b" : {"c" : 4, "d" : [6, 7]}}, "c" : 43}}
{"json" : {"a" : {"b" : {"c" : 5, "d" : [8, 9]}}, "b" : {"c" : 11, "j" : [1, 2, 3]}, "d" : {"e" : {"f" : ["s3", "s4"], "g" : 44}, "h" : "2020-02-02 10:00:00"}}}
')
```
```text
┌─json──────────────────────────────────────────────────────────┐
│ {"a":{"b":{"c":1}},"c":"42","d":{"i":["1","2","3"]}} │
│ {"a":{"b":{"c":2}},"d":{"i":["4","5","6"]}} │
│ {"a":{"b":{"c":3}},"e":"Hello, World!"} │
│ {"a":{"b":{"c":4}},"c":"43"} │
│ {"a":{"b":{"c":5}},"d":{"h":"2020-02-02 10:00:00.000000000"}} │
└───────────────────────────────────────────────────────────────┘
```
For text formats like CSV/TSV/etc, `JSON` is parsed from a string containing a JSON object
```sql
SELECT json FROM format(TSV, 'json JSON(a.b.c UInt32, SKIP a.b.d, SKIP REGEXP \'b.*\')',
'{"a" : {"b" : {"c" : 1, "d" : [0, 1]}}, "b" : "2020-01-01", "c" : 42, "d" : {"e" : {"f" : ["s1", "s2"]}, "i" : [1, 2, 3]}}
{"a" : {"b" : {"c" : 2, "d" : [2, 3]}}, "b" : [1, 2, 3], "c" : null, "d" : {"e" : {"g" : 43}, "i" : [4, 5, 6]}}
{"a" : {"b" : {"c" : 3, "d" : [4, 5]}}, "b" : {"c" : 10}, "e" : "Hello, World!"}
{"a" : {"b" : {"c" : 4, "d" : [6, 7]}}, "c" : 43}
{"a" : {"b" : {"c" : 5, "d" : [8, 9]}}, "b" : {"c" : 11, "j" : [1, 2, 3]}, "d" : {"e" : {"f" : ["s3", "s4"], "g" : 44}, "h" : "2020-02-02 10:00:00"}}')
```
```text
┌─json──────────────────────────────────────────────────────────┐
│ {"a":{"b":{"c":1}},"c":"42","d":{"i":["1","2","3"]}} │
│ {"a":{"b":{"c":2}},"d":{"i":["4","5","6"]}} │
│ {"a":{"b":{"c":3}},"e":"Hello, World!"} │
│ {"a":{"b":{"c":4}},"c":"43"} │
│ {"a":{"b":{"c":5}},"d":{"h":"2020-02-02 10:00:00.000000000"}} │
└───────────────────────────────────────────────────────────────┘
```
## Reaching the limit of dynamic paths inside JSON
`JSON` data type can store only limited number of paths as separate subcolumns inside. By default, this limit is 1024, but you can change it in type declaration using parameter `max_dynamic_paths`.
When the limit is reached, all new paths inserted into the `JSON` column will be stored in a single shared data structure. It's still possible to read such paths as subcolumns, but it will require reading the whole
shared data structure to extract the values of this path. This limit is needed to avoid the enormous number of different subcolumns that can make the table unusable.
Let's see what happens when the limit is reached in different scenarios.
### Reaching the limit during data parsing
During parsing of a `JSON` object from the data, when the limit is reached for the current block of data, all new paths will be stored in a shared data structure. We can check it using the introspection functions `JSONDynamicPaths` and `JSONSharedDataPaths`:
```sql
SELECT json, JSONDynamicPaths(json), JSONSharedDataPaths(json) FROM format(JSONEachRow, 'json JSON(max_dynamic_paths=3)', '
{"json" : {"a" : {"b" : 42}, "c" : [1, 2, 3]}}
{"json" : {"a" : {"b" : 43}, "d" : "2020-01-01"}}
{"json" : {"a" : {"b" : 44}, "c" : [4, 5, 6]}}
{"json" : {"a" : {"b" : 43}, "d" : "2020-01-02", "e" : "Hello", "f" : {"g" : 42.42}}}
{"json" : {"a" : {"b" : 43}, "c" : [7, 8, 9], "f" : {"g" : 43.43}, "h" : "World"}}
')
```
```text
┌─json───────────────────────────────────────────────────────────┬─JSONDynamicPaths(json)─┬─JSONSharedDataPaths(json)─┐
│ {"a":{"b":"42"},"c":["1","2","3"]} │ ['a.b','c','d'] │ [] │
│ {"a":{"b":"43"},"d":"2020-01-01"} │ ['a.b','c','d'] │ [] │
│ {"a":{"b":"44"},"c":["4","5","6"]} │ ['a.b','c','d'] │ [] │
│ {"a":{"b":"43"},"d":"2020-01-02","e":"Hello","f":{"g":42.42}} │ ['a.b','c','d'] │ ['e','f.g'] │
│ {"a":{"b":"43"},"c":["7","8","9"],"f":{"g":43.43},"h":"World"} │ ['a.b','c','d'] │ ['f.g','h'] │
└────────────────────────────────────────────────────────────────┴────────────────────────┴───────────────────────────┘
```
As we can see, after inserting paths `e` and `f.g` the limit was reached and they were inserted into the shared data structure.
### During merges of data parts in MergeTree table engines
During a merge of several data parts in a MergeTree table, the `JSON` column in the resulting data part can reach the limit of dynamic paths and won't be able to store all paths from source parts as subcolumns.
In this case ClickHouse chooses what paths will remain as subcolumns after the merge and what paths will be stored in the shared data structure. In most cases ClickHouse tries to keep the paths that contain
the largest number of non-null values and move the rarest paths to the shared data structure, but it depends on the implementation.
Let's see an example of such merge. First, let's create a table with `JSON` column, set the limit of dynamic paths to `3` and insert values with `5` different paths:
```sql
CREATE TABLE test (id UInt64, json JSON(max_dynamic_paths=3)) engine=MergeTree ORDER BY id;
SYSTEM STOP MERGES test;
INSERT INTO test SELECT number, formatRow('JSONEachRow', number as a) FROM numbers(5);
INSERT INTO test SELECT number, formatRow('JSONEachRow', number as b) FROM numbers(4);
INSERT INTO test SELECT number, formatRow('JSONEachRow', number as c) FROM numbers(3);
INSERT INTO test SELECT number, formatRow('JSONEachRow', number as d) FROM numbers(2);
INSERT INTO test SELECT number, formatRow('JSONEachRow', number as e) FROM numbers(1);
```
Each insert will create a separate data part with a `JSON` column containing a single path:
```sql
SELECT count(), JSONDynamicPaths(json) AS dynamic_paths, JSONSharedDataPaths(json) AS shared_data_paths, _part FROM test GROUP BY _part, dynamic_paths, shared_data_paths ORDER BY _part ASC
```
```text
┌─count()─┬─dynamic_paths─┬─shared_data_paths─┬─_part─────┐
│ 5 │ ['a'] │ [] │ all_1_1_0 │
│ 4 │ ['b'] │ [] │ all_2_2_0 │
│ 3 │ ['c'] │ [] │ all_3_3_0 │
│ 2 │ ['d'] │ [] │ all_4_4_0 │
│ 1 │ ['e'] │ [] │ all_5_5_0 │
└─────────┴───────────────┴───────────────────┴───────────┘
```
Now, let's merge all parts into one and see what will happen:
```sql
SYSTEM START MERGES test;
OPTIMIZE TABLE test FINAL;
SELECT count(), JSONDynamicPaths(json) AS dynamic_paths, JSONSharedDataPaths(json) AS shared_data_paths, _part FROM test GROUP BY _part, dynamic_paths, shared_data_paths ORDER BY _part;
```
```text
┌─count()─┬─dynamic_paths─┬─shared_data_paths─┬─_part─────┐
│ 1 │ ['a','b','c'] │ ['e'] │ all_1_5_2 │
│ 2 │ ['a','b','c'] │ ['d'] │ all_1_5_2 │
│ 12 │ ['a','b','c'] │ [] │ all_1_5_2 │
└─────────┴───────────────┴───────────────────┴───────────┘
```
As we can see, ClickHouse kept the most frequent paths `a`, `b` and `c` and moved paths `e` and `d` to shared data structure.
## Introspection functions
There are several functions that can help to inspect the content of the JSON column: [JSONAllPaths](../functions/json-functions.md#jsonallpaths), [JSONAllPathsWithTypes](../functions/json-functions.md#jsonallpathswithtypes), [JSONDynamicPaths](../functions/json-functions.md#jsondynamicpaths), [JSONDynamicPathsWithTypes](../functions/json-functions.md#jsondynamicpathswithtypes), [JSONSharedDataPaths](../functions/json-functions.md#jsonshareddatapaths), [JSONSharedDataPathsWithTypes](../functions/json-functions.md#jsonshareddatapathswithtypes).
## Tips for better usage of the JSON type
Before creating `JSON` column and loading data into it, consider the following tips:
- Investigate your data and specify as many path hints with types as you can. It will make the storage and the reading much more efficient.
- Think about what paths you will need and what paths you will never need. Specify paths that you won't need in the SKIP section and SKIP REGEXP if needed. It will improve the storage.
- Don't set `max_dynamic_paths` parameter to very high values, it can make the storage and reading less efficient.

View File

@ -1155,3 +1155,207 @@ SELECT jsonMergePatch('{"a":1}', '{"name": "joey"}', '{"name": "tom"}', '{"name"
│ {"a":1,"name":"zoey"} │
└───────────────────────┘
```
### JSONAllPaths
Returns the list of all paths stored in each row in [JSON](../data-types/newjson.md) column.
**Syntax**
``` sql
JSONAllPaths(json)
```
**Arguments**
- `json` — [JSON](../data-types/newjson.md).
**Returned value**
- An array of paths. [Array(String)](../data-types/array.md).
**Example**
``` sql
CREATE TABLE test (json JSON(max_dynamic_paths=1)) ENGINE = Memory;
INSERT INTO test FORMAT JSONEachRow {"json" : {"a" : 42}}, {"json" : {"b" : "Hello"}}, {"json" : {"a" : [1, 2, 3], "c" : "2020-01-01"}}
SELECT json, JSONAllPaths(json) FROM test;
```
```text
┌─json─────────────────────────────────┬─JSONAllPaths(json)─┐
│ {"a":"42"} │ ['a'] │
│ {"b":"Hello"} │ ['b'] │
│ {"a":["1","2","3"],"c":"2020-01-01"} │ ['a','c'] │
└──────────────────────────────────────┴────────────────────┘
```
### JSONAllPathsWithTypes
Returns the map of all paths and their data types stored in each row in [JSON](../data-types/newjson.md) column.
**Syntax**
``` sql
JSONAllPathsWithTypes(json)
```
**Arguments**
- `json` — [JSON](../data-types/newjson.md).
**Returned value**
- A map of paths and their data types. [Map(String, String)](../data-types/map.md).
**Example**
``` sql
CREATE TABLE test (json JSON(max_dynamic_paths=1)) ENGINE = Memory;
INSERT INTO test FORMAT JSONEachRow {"json" : {"a" : 42}}, {"json" : {"b" : "Hello"}}, {"json" : {"a" : [1, 2, 3], "c" : "2020-01-01"}}
SELECT json, JSONAllPathsWithTypes(json) FROM test;
```
```text
┌─json─────────────────────────────────┬─JSONAllPathsWithTypes(json)───────────────┐
│ {"a":"42"} │ {'a':'Int64'} │
│ {"b":"Hello"} │ {'b':'String'} │
│ {"a":["1","2","3"],"c":"2020-01-01"} │ {'a':'Array(Nullable(Int64))','c':'Date'} │
└──────────────────────────────────────┴───────────────────────────────────────────┘
```
### JSONDynamicPaths
Returns the list of dynamic paths that are stored as separate subcolumns in [JSON](../data-types/newjson.md) column.
**Syntax**
``` sql
JSONDynamicPaths(json)
```
**Arguments**
- `json` — [JSON](../data-types/newjson.md).
**Returned value**
- An array of paths. [Array(String)](../data-types/array.md).
**Example**
``` sql
CREATE TABLE test (json JSON(max_dynamic_paths=1)) ENGINE = Memory;
INSERT INTO test FORMAT JSONEachRow {"json" : {"a" : 42}}, {"json" : {"b" : "Hello"}}, {"json" : {"a" : [1, 2, 3], "c" : "2020-01-01"}}
SELECT json, JSONDynamicPaths(json) FROM test;
```
```text
┌─json─────────────────────────────────┬─JSONDynamicPaths(json)─┐
│ {"a":"42"}                           │ ['a']                  │
│ {"b":"Hello"} │ [] │
│ {"a":["1","2","3"],"c":"2020-01-01"} │ ['a'] │
└──────────────────────────────────────┴────────────────────────┘
```
### JSONDynamicPathsWithTypes
Returns the map of dynamic paths that are stored as separate subcolumns and their types in each row in [JSON](../data-types/newjson.md) column.
**Syntax**
``` sql
JSONDynamicPathsWithTypes(json)
```
**Arguments**
- `json` — [JSON](../data-types/newjson.md).
**Returned value**
- A map of paths and their data types. [Map(String, String)](../data-types/map.md).
**Example**
``` sql
CREATE TABLE test (json JSON(max_dynamic_paths=1)) ENGINE = Memory;
INSERT INTO test FORMAT JSONEachRow {"json" : {"a" : 42}}, {"json" : {"b" : "Hello"}}, {"json" : {"a" : [1, 2, 3], "c" : "2020-01-01"}}
SELECT json, JSONDynamicPathsWithTypes(json) FROM test;
```
```text
┌─json─────────────────────────────────┬─JSONDynamicPathsWithTypes(json)─┐
│ {"a":"42"} │ {'a':'Int64'} │
│ {"b":"Hello"} │ {} │
│ {"a":["1","2","3"],"c":"2020-01-01"} │ {'a':'Array(Nullable(Int64))'} │
└──────────────────────────────────────┴─────────────────────────────────┘
```
### JSONSharedDataPaths
Returns the list of paths that are stored in shared data structure in [JSON](../data-types/newjson.md) column.
**Syntax**
``` sql
JSONSharedDataPaths(json)
```
**Arguments**
- `json` — [JSON](../data-types/newjson.md).
**Returned value**
- An array of paths. [Array(String)](../data-types/array.md).
**Example**
``` sql
CREATE TABLE test (json JSON(max_dynamic_paths=1)) ENGINE = Memory;
INSERT INTO test FORMAT JSONEachRow {"json" : {"a" : 42}}, {"json" : {"b" : "Hello"}}, {"json" : {"a" : [1, 2, 3], "c" : "2020-01-01"}}
SELECT json, JSONSharedDataPaths(json) FROM test;
```
```text
┌─json─────────────────────────────────┬─JSONSharedDataPaths(json)─┐
│ {"a":"42"} │ [] │
│ {"b":"Hello"} │ ['b'] │
│ {"a":["1","2","3"],"c":"2020-01-01"} │ ['c'] │
└──────────────────────────────────────┴───────────────────────────┘
```
### JSONSharedDataPathsWithTypes
Returns the map of paths that are stored in shared data structure and their types in each row in [JSON](../data-types/newjson.md) column.
**Syntax**
``` sql
JSONSharedDataPathsWithTypes(json)
```
**Arguments**
- `json` — [JSON](../data-types/newjson.md).
**Returned value**
- A map of paths and their data types. [Map(String, String)](../data-types/map.md).
**Example**
``` sql
CREATE TABLE test (json JSON(max_dynamic_paths=1)) ENGINE = Memory;
INSERT INTO test FORMAT JSONEachRow {"json" : {"a" : 42}}, {"json" : {"b" : "Hello"}}, {"json" : {"a" : [1, 2, 3], "c" : "2020-01-01"}}
SELECT json, JSONSharedDataPathsWithTypes(json) FROM test;
```
```text
┌─json─────────────────────────────────┬─JSONSharedDataPathsWithTypes(json)─┐
│ {"a":"42"} │ {} │
│ {"b":"Hello"} │ {'b':'String'} │
│ {"a":["1","2","3"],"c":"2020-01-01"} │ {'c':'Date'} │
└──────────────────────────────────────┴────────────────────────────────────┘
```

View File

@ -161,6 +161,8 @@ CREATE MATERIALIZED VIEW [IF NOT EXISTS] [db.]table_name
REFRESH EVERY|AFTER interval [OFFSET interval]
RANDOMIZE FOR interval
DEPENDS ON [db.]name [, [db.]name [, ...]]
SETTINGS name = value [, name = value [, ...]]
[APPEND]
[TO[db.]name] [(columns)] [ENGINE = engine] [EMPTY]
AS SELECT ...
[COMMENT 'comment']
@ -170,18 +172,23 @@ where `interval` is a sequence of simple intervals:
number SECOND|MINUTE|HOUR|DAY|WEEK|MONTH|YEAR
```
Periodically runs the corresponding query and stores its result in a table, atomically replacing the table's previous contents.
Periodically runs the corresponding query and stores its result in a table.
* If the query says `APPEND`, each refresh inserts rows into the table without deleting existing rows. The insert is not atomic, just like a regular INSERT SELECT.
* Otherwise each refresh atomically replaces the table's previous contents.
Differences from regular non-refreshable materialized views:
* No insert trigger. I.e. when new data is inserted into the table specified in SELECT, it's *not* automatically pushed to the refreshable materialized view. The periodic refresh runs the entire query and replaces the entire table.
* No insert trigger. I.e. when new data is inserted into the table specified in SELECT, it's *not* automatically pushed to the refreshable materialized view. The periodic refresh runs the entire query.
* No restrictions on the SELECT query. Table functions (e.g. `url()`), views, UNION, JOIN, are all allowed.
:::note
The settings in the `REFRESH ... SETTINGS` part of the query are refresh settings (e.g. `refresh_retries`), distinct from regular settings (e.g. `max_threads`). Regular settings can be specified using `SETTINGS` at the end of the query.
:::
:::note
Refreshable materialized views are a work in progress. Setting `allow_experimental_refreshable_materialized_view = 1` is required for creating one. Current limitations:
* not compatible with Replicated database or table engines
* It is not supported in ClickHouse Cloud
* require [Atomic database engine](../../../engines/database-engines/atomic.md),
* no retries for failed refresh - we just skip to the next scheduled refresh time,
* no limit on number of concurrent refreshes.
:::
@ -246,15 +253,22 @@ A few more examples:
`DEPENDS ON` only works between refreshable materialized views. Listing a regular table in the `DEPENDS ON` list will prevent the view from ever refreshing (dependencies can be removed with `ALTER`, see below).
:::
### Settings
Available refresh settings:
* `refresh_retries` - How many times to retry if refresh query fails with an exception. If all retries fail, skip to the next scheduled refresh time. 0 means no retries, -1 means infinite retries. Default: 0.
* `refresh_retry_initial_backoff_ms` - Delay before the first retry, if `refresh_retries` is not zero. Each subsequent retry doubles the delay, up to `refresh_retry_max_backoff_ms`. Default: 100 ms.
* `refresh_retry_max_backoff_ms` - Limit on the exponential growth of delay between refresh attempts. Default: 60000 ms (1 minute).
### Changing Refresh Parameters {#changing-refresh-parameters}
To change refresh parameters:
```
ALTER TABLE [db.]name MODIFY REFRESH EVERY|AFTER ... [RANDOMIZE FOR ...] [DEPENDS ON ...]
ALTER TABLE [db.]name MODIFY REFRESH EVERY|AFTER ... [RANDOMIZE FOR ...] [DEPENDS ON ...] [SETTINGS ...]
```
:::note
This replaces refresh schedule *and* dependencies. If the table had a `DEPENDS ON`, doing a `MODIFY REFRESH` without `DEPENDS ON` will remove the dependencies.
This replaces *all* refresh parameters at once: schedule, dependencies, settings, and APPEND-ness. E.g. if the table had a `DEPENDS ON`, doing a `MODIFY REFRESH` without `DEPENDS ON` will remove the dependencies.
:::
### Other operations
@ -263,6 +277,10 @@ The status of all refreshable materialized views is available in table [`system.
To manually stop, start, trigger, or cancel refreshes use [`SYSTEM STOP|START|REFRESH|CANCEL VIEW`](../system.md#refreshable-materialized-views).
:::note
Fun fact: the refresh query is allowed to read from the view that's being refreshed, seeing pre-refresh version of the data. This means you can implement Conway's game of life: https://pastila.nl/?00021a4b/d6156ff819c83d490ad2dcec05676865#O0LGWTO7maUQIA4AcGUtlA==
:::
## Window View [Experimental]
:::info

View File

@ -526,6 +526,10 @@ Trigger an immediate out-of-schedule refresh of a given view.
SYSTEM REFRESH VIEW [db.]name
```
### WAIT VIEW
Wait for the currently running refresh to complete. If the refresh fails, throws an exception. If no refresh is running, completes immediately, throwing an exception if previous refresh failed.
### STOP VIEW, STOP VIEWS
Disable periodic refreshing of the given view or all refreshable views. If a refresh is in progress, cancel it too.

View File

@ -146,7 +146,30 @@ SELECT dictGet('dict', 'B', 2);
## Пример использования именованных соединений с базой данных PostgreSQL
Описание параметров смотрите [postgresql](../sql-reference/table-functions/postgresql.md).
Описание параметров смотрите [postgresql](../sql-reference/table-functions/postgresql.md). Дополнительно есть алиасы:
- `username` для `user`
- `db` для `database`.
Параметр `addresses_expr` используется в коллекции вместо `host:port`. Параметр опционален, потому что есть также другие: `host`, `hostname`, `port`. Следующий псевдокод показывает приоритет:
```sql
CASE
WHEN collection['addresses_expr'] != '' THEN collection['addresses_expr']
WHEN collection['host'] != '' THEN collection['host'] || ':' || if(collection['port'] != '', collection['port'], '5432')
WHEN collection['hostname'] != '' THEN collection['hostname'] || ':' || if(collection['port'] != '', collection['port'], '5432')
END
```
Пример создания:
```sql
CREATE NAMED COLLECTION mypg AS
user = 'pguser',
password = 'jw8s0F4',
host = '127.0.0.1',
port = 5432,
database = 'test',
schema = 'test_schema'
```
Пример конфигурации:
```xml
@ -199,6 +222,10 @@ SELECT * FROM mypgtable;
└───┘
```
:::note
PostgreSQL копирует данные из named collection при создании таблицы. Изменения в коллекции не влияют на существующие таблицы.
:::
### Пример использования именованных соединений базой данных с движком PostgreSQL
```sql

View File

@ -175,6 +175,11 @@ int mainEntryClickHouseFormat(int argc, char ** argv)
hash_func.update(options["seed"].as<std::string>());
}
SharedContextHolder shared_context = Context::createShared();
auto context = Context::createGlobal(shared_context.get());
auto context_const = WithContext(context).getContext();
context->makeGlobalContext();
registerInterpreters();
registerFunctions();
registerAggregateFunctions();

View File

@ -93,7 +93,7 @@ namespace
break;
}
UUID id = parse<UUID>(line);
UUID id = parse<UUID>(line.substr(0, line.find('\t')));
line.clear();
String queries;

View File

@ -1,5 +1,5 @@
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeObject.h>
#include <DataTypes/DataTypeObjectDeprecated.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/NestedUtils.h>
@ -452,10 +452,10 @@ QueryTreeNodePtr IdentifierResolver::tryResolveIdentifierFromCompoundExpression(
if (auto * column = compound_expression->as<ColumnNode>())
{
const DataTypePtr & column_type = column->getColumn().getTypeInStorage();
if (column_type->getTypeId() == TypeIndex::Object)
if (column_type->getTypeId() == TypeIndex::ObjectDeprecated)
{
const auto * object_type = checkAndGetDataType<DataTypeObject>(column_type.get());
if (object_type->getSchemaFormat() == "json" && object_type->hasNullableSubcolumns())
const auto & object_type = checkAndGetDataType<DataTypeObjectDeprecated>(*column_type);
if (object_type.getSchemaFormat() == "json" && object_type.hasNullableSubcolumns())
{
QueryTreeNodePtr constant_node_null = std::make_shared<ConstantNode>(Field());
return constant_node_null;
@ -1000,7 +1000,6 @@ QueryTreeNodePtr IdentifierResolver::tryResolveIdentifierFromJoin(const Identifi
if (!join_node_in_resolve_process && from_join_node.isUsingJoinExpression())
{
auto & join_using_list = from_join_node.getJoinExpression()->as<ListNode &>();
for (auto & join_using_node : join_using_list.getNodes())
{
auto & column_node = join_using_node->as<ColumnNode &>();

View File

@ -3,7 +3,7 @@
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeObject.h>
#include <DataTypes/DataTypeObjectDeprecated.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeMap.h>

View File

@ -452,6 +452,11 @@ void ColumnArray::reserve(size_t n)
getData().reserve(n); /// The average size of arrays is not taken into account here. Or it is considered to be no more than 1.
}
size_t ColumnArray::capacity() const
{
return getOffsets().capacity();
}
void ColumnArray::prepareForSquashing(const Columns & source_columns)
{
size_t new_size = size();

View File

@ -118,6 +118,7 @@ public:
void updatePermutationWithCollation(const Collator & collator, PermutationSortDirection direction, PermutationSortStability stability,
size_t limit, int nan_direction_hint, Permutation & res, EqualRanges& equal_ranges) const override;
void reserve(size_t n) override;
size_t capacity() const override;
void prepareForSquashing(const Columns & source_columns) override;
void shrinkToFit() override;
void ensureOwnership() override;

View File

@ -53,6 +53,7 @@ public:
size_t allocatedBytes() const override { return data.allocated_bytes(); }
void protect() override { data.protect(); }
void reserve(size_t n) override { data.reserve_exact(n); }
size_t capacity() const override { return data.capacity(); }
void shrinkToFit() override { data.shrink_to_fit(); }
#if !defined(DEBUG_OR_SANITIZER_BUILD)

View File

@ -16,7 +16,6 @@
#include <IO/ReadBufferFromMemory.h>
#include <IO/WriteBufferFromString.h>
#include <Formats/FormatSettings.h>
#include <Common/logger_useful.h>
namespace DB
{
@ -56,6 +55,7 @@ ColumnDynamic::ColumnDynamic(size_t max_dynamic_types_) : max_dynamic_types(max_
ColumnDynamic::ColumnDynamic(
MutableColumnPtr variant_column_, const DataTypePtr & variant_type_, size_t max_dynamic_types_, size_t global_max_dynamic_types_, const StatisticsPtr & statistics_)
: variant_column(std::move(variant_column_))
, variant_column_ptr(assert_cast<ColumnVariant *>(variant_column.get()))
, max_dynamic_types(max_dynamic_types_)
, global_max_dynamic_types(global_max_dynamic_types_)
, statistics(statistics_)
@ -66,6 +66,7 @@ ColumnDynamic::ColumnDynamic(
ColumnDynamic::ColumnDynamic(
MutableColumnPtr variant_column_, const VariantInfo & variant_info_, size_t max_dynamic_types_, size_t global_max_dynamic_types_, const StatisticsPtr & statistics_)
: variant_column(std::move(variant_column_))
, variant_column_ptr(assert_cast<ColumnVariant *>(variant_column.get()))
, variant_info(variant_info_)
, max_dynamic_types(max_dynamic_types_)
, global_max_dynamic_types(global_max_dynamic_types_)
@ -79,6 +80,7 @@ void ColumnDynamic::setVariantType(const DataTypePtr & variant_type)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Setting specific variant type is allowed only for empty dynamic column");
variant_column = variant_type->createColumn();
variant_column_ptr = assert_cast<ColumnVariant *>(variant_column.get());
createVariantInfo(variant_type);
}
@ -313,12 +315,12 @@ void ColumnDynamic::doInsertFrom(const IColumn & src_, size_t n)
/// Check if we have the same variants in both columns.
if (variant_info.variant_name == dynamic_src.variant_info.variant_name)
{
variant_column->insertFrom(*dynamic_src.variant_column, n);
variant_column_ptr->insertFrom(*dynamic_src.variant_column, n);
return;
}
auto & variant_col = assert_cast<ColumnVariant &>(*variant_column);
const auto & src_variant_col = assert_cast<const ColumnVariant &>(*dynamic_src.variant_column);
auto & variant_col = getVariantColumn();
const auto & src_variant_col = dynamic_src.getVariantColumn();
auto src_global_discr = src_variant_col.globalDiscriminatorAt(n);
auto src_offset = src_variant_col.offsetAt(n);
@ -386,16 +388,15 @@ void ColumnDynamic::doInsertRangeFrom(const IColumn & src_, size_t start, size_t
"[start({}) + length({}) > src.size()({})]", start, length, src_.size());
const auto & dynamic_src = assert_cast<const ColumnDynamic &>(src_);
auto & variant_col = getVariantColumn();
/// Check if we have the same variants in both columns.
if (variant_info.variant_names == dynamic_src.variant_info.variant_names)
{
variant_column->insertRangeFrom(*dynamic_src.variant_column, start, length);
variant_col.insertRangeFrom(*dynamic_src.variant_column, start, length);
return;
}
auto & variant_col = assert_cast<ColumnVariant &>(*variant_column);
/// If variants are different, we need to extend our variant with new variants.
if (auto * global_discriminators_mapping = combineVariants(dynamic_src.variant_info))
{
@ -602,15 +603,15 @@ void ColumnDynamic::doInsertManyFrom(const IColumn & src_, size_t position, size
#endif
{
const auto & dynamic_src = assert_cast<const ColumnDynamic &>(src_);
auto & variant_col = getVariantColumn();
/// Check if we have the same variants in both columns.
if (variant_info.variant_names == dynamic_src.variant_info.variant_names)
{
variant_column->insertManyFrom(*dynamic_src.variant_column, position, length);
variant_col.insertManyFrom(*dynamic_src.variant_column, position, length);
return;
}
auto & variant_col = assert_cast<ColumnVariant &>(*variant_column);
const auto & src_variant_col = assert_cast<const ColumnVariant &>(*dynamic_src.variant_column);
auto src_global_discr = src_variant_col.globalDiscriminatorAt(position);
auto src_offset = src_variant_col.offsetAt(position);
@ -751,7 +752,7 @@ StringRef ColumnDynamic::serializeValueIntoArena(size_t n, Arena & arena, const
const char * ColumnDynamic::deserializeAndInsertFromArena(const char * pos)
{
auto & variant_col = assert_cast<ColumnVariant &>(*variant_column);
auto & variant_col = getVariantColumn();
UInt8 null_bit = unalignedLoad<UInt8>(pos);
pos += sizeof(UInt8);
if (null_bit)
@ -808,7 +809,7 @@ const char * ColumnDynamic::skipSerializedInArena(const char * pos) const
void ColumnDynamic::updateHashWithValue(size_t n, SipHash & hash) const
{
const auto & variant_col = assert_cast<const ColumnVariant &>(*variant_column);
const auto & variant_col = getVariantColumn();
auto discr = variant_col.globalDiscriminatorAt(n);
if (discr == ColumnVariant::NULL_DISCRIMINATOR)
{
@ -826,9 +827,9 @@ int ColumnDynamic::compareAt(size_t n, size_t m, const IColumn & rhs, int nan_di
int ColumnDynamic::doCompareAt(size_t n, size_t m, const IColumn & rhs, int nan_direction_hint) const
#endif
{
const auto & left_variant = assert_cast<const ColumnVariant &>(*variant_column);
const auto & left_variant = getVariantColumn();
const auto & right_dynamic = assert_cast<const ColumnDynamic &>(rhs);
const auto & right_variant = assert_cast<const ColumnVariant &>(*right_dynamic.variant_column);
const auto & right_variant = right_dynamic.getVariantColumn();
auto left_discr = left_variant.globalDiscriminatorAt(n);
auto left_shared_variant_discr = getSharedVariantDiscriminator();
@ -970,7 +971,7 @@ void ColumnDynamic::updatePermutation(IColumn::PermutationSortDirection directio
ColumnPtr ColumnDynamic::compress() const
{
ColumnPtr variant_compressed = variant_column->compress();
ColumnPtr variant_compressed = variant_column_ptr->compress();
size_t byte_size = variant_compressed->byteSize();
return ColumnCompressed::create(size(), byte_size,
[my_variant_compressed = std::move(variant_compressed), my_variant_info = variant_info, my_max_dynamic_types = max_dynamic_types, my_global_max_dynamic_types = global_max_dynamic_types, my_statistics = statistics]() mutable
@ -998,7 +999,18 @@ void ColumnDynamic::prepareForSquashing(const Columns & source_columns)
variant_col.getLocalDiscriminators().reserve_exact(new_size);
variant_col.getOffsets().reserve_exact(new_size);
/// Second, collect all variants and their total sizes.
/// Second, preallocate memory for variants.
prepareVariantsForSquashing(source_columns);
}
void ColumnDynamic::prepareVariantsForSquashing(const Columns & source_columns)
{
/// Internal variants of source dynamic columns may differ.
/// We want to preallocate memory for all variants we will have after squashing.
/// It may happen that the total number of variants in source columns will
/// exceed the limit, in this case we will choose the most frequent variants.
/// Collect all variants and their total sizes.
std::unordered_map<String, size_t> total_variant_sizes;
DataTypes all_variants;
@ -1072,6 +1084,7 @@ void ColumnDynamic::prepareForSquashing(const Columns & source_columns)
/// Now current dynamic column has all resulting variants and we can call
/// prepareForSquashing on them to preallocate the memory.
auto & variant_col = getVariantColumn();
for (size_t i = 0; i != variant_info.variant_names.size(); ++i)
{
Columns source_variant_columns;
@ -1240,12 +1253,12 @@ void ColumnDynamic::takeDynamicStructureFromSourceColumns(const Columns & source
void ColumnDynamic::applyNullMap(const ColumnVector<UInt8>::Container & null_map)
{
assert_cast<ColumnVariant &>(*variant_column).applyNullMap(null_map);
variant_column_ptr->applyNullMap(null_map);
}
void ColumnDynamic::applyNegatedNullMap(const ColumnVector<UInt8>::Container & null_map)
{
assert_cast<ColumnVariant &>(*variant_column).applyNegatedNullMap(null_map);
variant_column_ptr->applyNegatedNullMap(null_map);
}
}

View File

@ -106,7 +106,7 @@ public:
return create(variant_column_->assumeMutable(), variant_type, max_dynamic_types_, global_max_dynamic_types_, statistics_);
}
static MutablePtr create(size_t max_dynamic_types_)
static MutablePtr create(size_t max_dynamic_types_ = MAX_DYNAMIC_TYPES_LIMIT)
{
return Base::create(max_dynamic_types_);
}
@ -136,7 +136,7 @@ public:
size_t size() const override
{
return variant_column->size();
return variant_column_ptr->size();
}
Field operator[](size_t n) const override;
@ -145,22 +145,22 @@ public:
bool isDefaultAt(size_t n) const override
{
return variant_column->isDefaultAt(n);
return variant_column_ptr->isDefaultAt(n);
}
bool isNullAt(size_t n) const override
{
return variant_column->isNullAt(n);
return variant_column_ptr->isNullAt(n);
}
StringRef getDataAt(size_t n) const override
{
return variant_column->getDataAt(n);
return variant_column_ptr->getDataAt(n);
}
void insertData(const char * pos, size_t length) override
{
variant_column->insertData(pos, length);
variant_column_ptr->insertData(pos, length);
}
void insert(const Field & x) override;
@ -178,17 +178,17 @@ public:
void insertDefault() override
{
variant_column->insertDefault();
variant_column_ptr->insertDefault();
}
void insertManyDefaults(size_t length) override
{
variant_column->insertManyDefaults(length);
variant_column_ptr->insertManyDefaults(length);
}
void popBack(size_t n) override
{
variant_column->popBack(n);
variant_column_ptr->popBack(n);
}
StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const override;
@ -199,42 +199,42 @@ public:
WeakHash32 getWeakHash32() const override
{
return variant_column->getWeakHash32();
return variant_column_ptr->getWeakHash32();
}
void updateHashFast(SipHash & hash) const override
{
variant_column->updateHashFast(hash);
variant_column_ptr->updateHashFast(hash);
}
ColumnPtr filter(const Filter & filt, ssize_t result_size_hint) const override
{
return create(variant_column->filter(filt, result_size_hint), variant_info, max_dynamic_types, global_max_dynamic_types);
return create(variant_column_ptr->filter(filt, result_size_hint), variant_info, max_dynamic_types, global_max_dynamic_types);
}
void expand(const Filter & mask, bool inverted) override
{
variant_column->expand(mask, inverted);
variant_column_ptr->expand(mask, inverted);
}
ColumnPtr permute(const Permutation & perm, size_t limit) const override
{
return create(variant_column->permute(perm, limit), variant_info, max_dynamic_types, global_max_dynamic_types);
return create(variant_column_ptr->permute(perm, limit), variant_info, max_dynamic_types, global_max_dynamic_types);
}
ColumnPtr index(const IColumn & indexes, size_t limit) const override
{
return create(variant_column->index(indexes, limit), variant_info, max_dynamic_types, global_max_dynamic_types);
return create(variant_column_ptr->index(indexes, limit), variant_info, max_dynamic_types, global_max_dynamic_types);
}
ColumnPtr replicate(const Offsets & replicate_offsets) const override
{
return create(variant_column->replicate(replicate_offsets), variant_info, max_dynamic_types, global_max_dynamic_types);
return create(variant_column_ptr->replicate(replicate_offsets), variant_info, max_dynamic_types, global_max_dynamic_types);
}
MutableColumns scatter(ColumnIndex num_columns, const Selector & selector) const override
{
MutableColumns scattered_variant_columns = variant_column->scatter(num_columns, selector);
MutableColumns scattered_variant_columns = variant_column_ptr->scatter(num_columns, selector);
MutableColumns scattered_columns;
scattered_columns.reserve(num_columns);
for (auto & scattered_variant_column : scattered_variant_columns)
@ -251,12 +251,12 @@ public:
bool hasEqualValues() const override
{
return variant_column->hasEqualValues();
return variant_column_ptr->hasEqualValues();
}
void getExtremes(Field & min, Field & max) const override
{
variant_column->getExtremes(min, max);
variant_column_ptr->getExtremes(min, max);
}
void getPermutation(IColumn::PermutationSortDirection direction, IColumn::PermutationSortStability stability,
@ -267,44 +267,53 @@ public:
void reserve(size_t n) override
{
variant_column->reserve(n);
variant_column_ptr->reserve(n);
}
size_t capacity() const override
{
return variant_column_ptr->capacity();
}
void prepareForSquashing(const Columns & source_columns) override;
/// Prepare only variants but not discriminators and offsets.
void prepareVariantsForSquashing(const Columns & source_columns);
void ensureOwnership() override
{
variant_column->ensureOwnership();
variant_column_ptr->ensureOwnership();
}
size_t byteSize() const override
{
return variant_column->byteSize();
return variant_column_ptr->byteSize();
}
size_t byteSizeAt(size_t n) const override
{
return variant_column->byteSizeAt(n);
return variant_column_ptr->byteSizeAt(n);
}
size_t allocatedBytes() const override
{
return variant_column->allocatedBytes();
return variant_column_ptr->allocatedBytes();
}
void protect() override
{
variant_column->protect();
variant_column_ptr->protect();
}
void forEachSubcolumn(MutableColumnCallback callback) override
{
callback(variant_column);
variant_column_ptr = assert_cast<ColumnVariant *>(variant_column.get());
}
void forEachSubcolumnRecursively(RecursiveMutableColumnCallback callback) override
{
callback(*variant_column);
variant_column_ptr = assert_cast<ColumnVariant *>(variant_column.get());
variant_column->forEachSubcolumnRecursively(callback);
}
@ -319,27 +328,27 @@ public:
double getRatioOfDefaultRows(double sample_ratio) const override
{
return variant_column->getRatioOfDefaultRows(sample_ratio);
return variant_column_ptr->getRatioOfDefaultRows(sample_ratio);
}
UInt64 getNumberOfDefaultRows() const override
{
return variant_column->getNumberOfDefaultRows();
return variant_column_ptr->getNumberOfDefaultRows();
}
void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override
{
variant_column->getIndicesOfNonDefaultRows(indices, from, limit);
variant_column_ptr->getIndicesOfNonDefaultRows(indices, from, limit);
}
void finalize() override
{
variant_column->finalize();
variant_column_ptr->finalize();
}
bool isFinalized() const override
{
return variant_column->isFinalized();
return variant_column_ptr->isFinalized();
}
/// Apply null map to a nested Variant column.
@ -351,8 +360,8 @@ public:
const ColumnPtr & getVariantColumnPtr() const { return variant_column; }
ColumnPtr & getVariantColumnPtr() { return variant_column; }
const ColumnVariant & getVariantColumn() const { return assert_cast<const ColumnVariant &>(*variant_column); }
ColumnVariant & getVariantColumn() { return assert_cast<ColumnVariant &>(*variant_column); }
const ColumnVariant & getVariantColumn() const { return *variant_column_ptr; }
ColumnVariant & getVariantColumn() { return *variant_column_ptr; }
bool addNewVariant(const DataTypePtr & new_variant, const String & new_variant_name);
bool addNewVariant(const DataTypePtr & new_variant) { return addNewVariant(new_variant, new_variant->getName()); }
@ -420,6 +429,7 @@ public:
}
const SerializationPtr & getVariantSerialization(const DataTypePtr & variant_type) const { return getVariantSerialization(variant_type, variant_type->getName()); }
private:
void createVariantInfo(const DataTypePtr & variant_type);
@ -432,6 +442,10 @@ private:
void updateVariantInfoAndExpandVariantColumn(const DataTypePtr & new_variant_type);
WrappedPtr variant_column;
/// Store and use pointer to ColumnVariant to avoid virtual calls.
/// ColumnDynamic is widely used inside ColumnObject for each path and
/// with hundreds of paths these virtual calls are noticeable.
ColumnVariant * variant_column_ptr;
/// Store the type of current variant with some additional information.
VariantInfo variant_info;
/// The maximum number of different types that can be stored in this Dynamic column.

View File

@ -182,6 +182,11 @@ public:
chars.reserve_exact(n * size);
}
size_t capacity() const override
{
return chars.capacity() / n;
}
void shrinkToFit() override
{
chars.shrink_to_fit();

View File

@ -172,6 +172,7 @@ public:
}
void reserve(size_t n) override { idx.reserve(n); }
size_t capacity() const override { return idx.capacity(); }
void shrinkToFit() override { idx.shrinkToFit(); }
/// Don't count the dictionary size as it can be shared between different blocks.
@ -309,6 +310,7 @@ public:
void popBack(size_t n) { positions->popBack(n); }
void reserve(size_t n) { positions->reserve(n); }
size_t capacity() const { return positions->capacity(); }
void shrinkToFit() { positions->shrinkToFit(); }
UInt64 getMaxPositionForCurrentType() const;

View File

@ -249,6 +249,11 @@ void ColumnMap::reserve(size_t n)
nested->reserve(n);
}
size_t ColumnMap::capacity() const
{
return nested->capacity();
}
void ColumnMap::prepareForSquashing(const Columns & source_columns)
{
Columns nested_source_columns;

View File

@ -94,6 +94,7 @@ public:
void updatePermutation(IColumn::PermutationSortDirection direction, IColumn::PermutationSortStability stability,
size_t limit, int nan_direction_hint, IColumn::Permutation & res, EqualRanges & equal_ranges) const override;
void reserve(size_t n) override;
size_t capacity() const override;
void prepareForSquashing(const Columns & source_columns) override;
void shrinkToFit() override;
void ensureOwnership() override;

View File

@ -706,6 +706,11 @@ void ColumnNullable::reserve(size_t n)
getNullMapData().reserve(n);
}
size_t ColumnNullable::capacity() const
{
return getNullMapData().capacity();
}
void ColumnNullable::prepareForSquashing(const Columns & source_columns)
{
size_t new_size = size();

View File

@ -125,6 +125,7 @@ public:
size_t limit, int null_direction_hint, Permutation & res, EqualRanges& equal_ranges) const override;
size_t estimateCardinalityInPermutedRange(const Permutation & permutation, const EqualRange & equal_range) const override;
void reserve(size_t n) override;
size_t capacity() const override;
void prepareForSquashing(const Columns & source_columns) override;
void shrinkToFit() override;
void ensureOwnership() override;

File diff suppressed because it is too large Load Diff

View File

@ -1,216 +1,117 @@
#pragma once
#include <Columns/IColumn.h>
#include <Core/Field.h>
#include <Core/Names.h>
#include <DataTypes/Serializations/SubcolumnsTree.h>
#include <Common/PODArray.h>
#include <Common/WeakHash.h>
#include <Columns/ColumnVector.h>
#include <Columns/ColumnArray.h>
#include <Columns/ColumnTuple.h>
#include <Columns/ColumnString.h>
#include <DataTypes/IDataType.h>
#include <DataTypes/Serializations/SerializationDynamic.h>
#include <Formats/FormatSettings.h>
#include <Common/StringHashForHeterogeneousLookup.h>
#include <Common/WeakHash.h>
namespace DB
{
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
}
/// Info that represents a scalar or array field in a decomposed view.
/// It allows to recreate field with different number
/// of dimensions or nullability.
struct FieldInfo
{
/// The common type of of all scalars in field.
DataTypePtr scalar_type;
/// Do we have NULL scalar in field.
bool have_nulls;
/// If true then we have scalars with different types in array and
/// we need to convert scalars to the common type.
bool need_convert;
/// Number of dimension in array. 0 if field is scalar.
size_t num_dimensions;
/// If true then this field is an array of variadic dimension field
/// and we need to normalize the dimension
bool need_fold_dimension;
};
FieldInfo getFieldInfo(const Field & field);
/** A column that represents object with dynamic set of subcolumns.
* Subcolumns are identified by paths in document and are stored in
* a trie-like structure. ColumnObject is not suitable for writing into tables
* and it should be converted to Tuple with fixed set of subcolumns before that.
*/
class ColumnObject final : public COWHelper<IColumnHelper<ColumnObject>, ColumnObject>
{
public:
/** Class that represents one subcolumn.
* It stores values in several parts of column
* and keeps current common type of all parts.
* We add a new column part with a new type, when we insert a field,
* which can't be converted to the current common type.
* After insertion of all values subcolumn should be finalized
* for writing and other operations.
*/
class Subcolumn
struct Statistics
{
public:
Subcolumn() = default;
Subcolumn(size_t size_, bool is_nullable_);
Subcolumn(MutableColumnPtr && data_, bool is_nullable_);
size_t size() const;
size_t byteSize() const;
size_t allocatedBytes() const;
void get(size_t n, Field & res) const;
bool isFinalized() const;
const DataTypePtr & getLeastCommonType() const { return least_common_type.get(); }
const DataTypePtr & getLeastCommonTypeBase() const { return least_common_type.getBase(); }
size_t getNumberOfDimensions() const { return least_common_type.getNumberOfDimensions(); }
/// Checks the consistency of column's parts stored in @data.
void checkTypes() const;
/// Inserts a field, which scalars can be arbitrary, but number of
/// dimensions should be consistent with current common type.
void insert(Field field);
void insert(Field field, FieldInfo info);
void insertDefault();
void insertManyDefaults(size_t length);
void insertRangeFrom(const Subcolumn & src, size_t start, size_t length);
void popBack(size_t n);
Subcolumn cut(size_t start, size_t length) const;
/// Converts all column's parts to the common type and
/// creates a single column that stores all values.
void finalize();
/// Returns last inserted field.
Field getLastField() const;
FieldInfo getFieldInfo() const;
/// Recreates subcolumn with default scalar values and keeps sizes of arrays.
/// Used to create columns of type Nested with consistent array sizes.
Subcolumn recreateWithDefaultValues(const FieldInfo & field_info) const;
/// Returns single column if subcolumn in finalizes.
/// Otherwise -- undefined behaviour.
IColumn & getFinalizedColumn();
const IColumn & getFinalizedColumn() const;
const ColumnPtr & getFinalizedColumnPtr() const;
const std::vector<WrappedPtr> & getData() const { return data; }
size_t getNumberOfDefaultsInPrefix() const { return num_of_defaults_in_prefix; }
friend class ColumnObject;
private:
class LeastCommonType
enum class Source
{
public:
LeastCommonType();
explicit LeastCommonType(DataTypePtr type_);
const DataTypePtr & get() const { return type; }
const DataTypePtr & getBase() const { return base_type; }
size_t getNumberOfDimensions() const { return num_dimensions; }
private:
DataTypePtr type;
DataTypePtr base_type;
size_t num_dimensions = 0;
READ, /// Statistics were loaded into column during reading from MergeTree.
MERGE, /// Statistics were calculated during merge of several MergeTree parts.
};
void addNewColumnPart(DataTypePtr type);
explicit Statistics(Source source_) : source(source_) {}
/// Current least common type of all values inserted to this subcolumn.
LeastCommonType least_common_type;
/// If true then common type type of subcolumn is Nullable
/// and default values are NULLs.
bool is_nullable = false;
/// Parts of column. Parts should be in increasing order in terms of subtypes/supertypes.
/// That means that the least common type for i-th prefix is the type of i-th part
/// and it's the supertype for all type of column from 0 to i-1.
std::vector<WrappedPtr> data;
/// Until we insert any non-default field we don't know further
/// least common type and we count number of defaults in prefix,
/// which will be converted to the default type of final common type.
size_t num_of_defaults_in_prefix = 0;
size_t num_rows = 0;
/// Source of the statistics.
Source source;
/// Statistics for dynamic paths: (path) -> (total number of not-null values).
std::unordered_map<String, size_t> dynamic_paths_statistics;
/// Statistics for paths in shared data: path) -> (total number of not-null values).
/// We don't store statistics for all paths in shared data but only for some subset of them
/// (is 10000 a good limit? It should not be expensive to store 10000 paths per part)
static const size_t MAX_SHARED_DATA_STATISTICS_SIZE = 10000;
std::unordered_map<String, size_t, StringHashForHeterogeneousLookup, StringHashForHeterogeneousLookup::transparent_key_equal> shared_data_paths_statistics;
};
using Subcolumns = SubcolumnsTree<Subcolumn>;
using StatisticsPtr = std::shared_ptr<const Statistics>;
private:
/// If true then all subcolumns are nullable.
const bool is_nullable;
friend class COWHelper<IColumnHelper<ColumnObject>, ColumnObject>;
Subcolumns subcolumns;
size_t num_rows;
ColumnObject(std::unordered_map<String, MutableColumnPtr> typed_paths_, size_t max_dynamic_paths_, size_t max_dynamic_types_);
ColumnObject(
std::unordered_map<String, MutableColumnPtr> typed_paths_,
std::unordered_map<String, MutableColumnPtr> dynamic_paths_,
MutableColumnPtr shared_data_,
size_t max_dynamic_paths_,
size_t global_max_dynamic_paths_,
size_t max_dynamic_types_,
const StatisticsPtr & statistics_ = {});
/// Use StringHashForHeterogeneousLookup hash for hash maps to be able to use std::string_view in find() method.
using PathToColumnMap = std::unordered_map<String, WrappedPtr, StringHashForHeterogeneousLookup, StringHashForHeterogeneousLookup::transparent_key_equal>;
using PathToDynamicColumnPtrMap = std::unordered_map<String, ColumnDynamic *, StringHashForHeterogeneousLookup, StringHashForHeterogeneousLookup::transparent_key_equal>;
public:
static constexpr auto COLUMN_NAME_DUMMY = "_dummy";
/** Create immutable column using immutable arguments. This arguments may be shared with other columns.
* Use mutate in order to make mutable column and mutate shared nested columns.
*/
using Base = COWHelper<IColumnHelper<ColumnObject>, ColumnObject>;
explicit ColumnObject(bool is_nullable_);
ColumnObject(Subcolumns && subcolumns_, bool is_nullable_);
static Ptr create(
const std::unordered_map<String, ColumnPtr> & typed_paths_,
const std::unordered_map<String, ColumnPtr> & dynamic_paths_,
const ColumnPtr & shared_data_,
size_t max_dynamic_paths_,
size_t global_max_dynamic_paths_,
size_t max_dynamic_types_,
const StatisticsPtr & statistics_ = {});
/// Checks that all subcolumns have consistent sizes.
void checkConsistency() const;
static MutablePtr create(
std::unordered_map<String, MutableColumnPtr> typed_paths_,
std::unordered_map<String, MutableColumnPtr> dynamic_paths_,
MutableColumnPtr shared_data_,
size_t max_dynamic_paths_,
size_t global_max_dynamic_paths_,
size_t max_dynamic_types_,
const StatisticsPtr & statistics_ = {});
bool hasSubcolumn(const PathInData & key) const;
static MutablePtr create(std::unordered_map<String, MutableColumnPtr> typed_paths_, size_t max_dynamic_paths_, size_t max_dynamic_types_);
const Subcolumn & getSubcolumn(const PathInData & key) const;
Subcolumn & getSubcolumn(const PathInData & key);
std::string getName() const override;
void incrementNumRows() { ++num_rows; }
const char * getFamilyName() const override
{
return "Object";
}
/// Adds a subcolumn from existing IColumn.
void addSubcolumn(const PathInData & key, MutableColumnPtr && subcolumn);
TypeIndex getDataType() const override
{
return TypeIndex::Object;
}
/// Adds a subcolumn of specific size with default values.
void addSubcolumn(const PathInData & key, size_t new_size);
MutableColumnPtr cloneEmpty() const override;
MutableColumnPtr cloneResized(size_t size) const override;
/// Adds a subcolumn of type Nested of specific size with default values.
/// It cares about consistency of sizes of Nested arrays.
void addNestedSubcolumn(const PathInData & key, const FieldInfo & field_info, size_t new_size);
size_t size() const override
{
return shared_data->size();
}
/// Finds a subcolumn from the same Nested type as @entry and inserts
/// an array with default values with consistent sizes as in Nested type.
bool tryInsertDefaultFromNested(const Subcolumns::NodePtr & entry) const;
bool tryInsertManyDefaultsFromNested(const Subcolumns::NodePtr & entry) const;
Field operator[](size_t n) const override;
void get(size_t n, Field & res) const override;
const Subcolumns & getSubcolumns() const { return subcolumns; }
Subcolumns & getSubcolumns() { return subcolumns; }
PathsInData getKeys() const;
/// Part of interface
const char * getFamilyName() const override { return "Object"; }
TypeIndex getDataType() const override { return TypeIndex::Object; }
size_t size() const override;
size_t byteSize() const override;
size_t allocatedBytes() const override;
void forEachSubcolumn(MutableColumnCallback callback) override;
void forEachSubcolumnRecursively(RecursiveMutableColumnCallback callback) override;
void insert(const Field & field) override;
bool tryInsert(const Field & field) override;
void insertDefault() override;
bool isDefaultAt(size_t n) const override;
StringRef getDataAt(size_t n) const override;
void insertData(const char * pos, size_t length) override;
void insert(const Field & x) override;
bool tryInsert(const Field & x) override;
#if !defined(DEBUG_OR_SANITIZER_BUILD)
void insertFrom(const IColumn & src, size_t n) override;
void insertRangeFrom(const IColumn & src, size_t start, size_t length) override;
@ -218,24 +119,31 @@ public:
void doInsertFrom(const IColumn & src, size_t n) override;
void doInsertRangeFrom(const IColumn & src, size_t start, size_t length) override;
#endif
/// TODO: implement more optimal insertManyFrom
void insertDefault() override;
void insertManyDefaults(size_t length) override;
void popBack(size_t length) override;
Field operator[](size_t n) const override;
void get(size_t n, Field & res) const override;
void popBack(size_t n) override;
StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const override;
const char * deserializeAndInsertFromArena(const char * pos) override;
const char * skipSerializedInArena(const char * pos) const override;
void updateHashWithValue(size_t n, SipHash & hash) const override;
WeakHash32 getWeakHash32() const override;
void updateHashFast(SipHash & hash) const override;
ColumnPtr filter(const Filter & filt, ssize_t result_size_hint) const override;
void expand(const Filter & mask, bool inverted) override;
ColumnPtr permute(const Permutation & perm, size_t limit) const override;
ColumnPtr filter(const Filter & filter, ssize_t result_size_hint) const override;
ColumnPtr index(const IColumn & indexes, size_t limit) const override;
ColumnPtr replicate(const Offsets & offsets) const override;
MutableColumnPtr cloneResized(size_t new_size) const override;
ColumnPtr replicate(const Offsets & replicate_offsets) const override;
MutableColumns scatter(ColumnIndex num_columns, const Selector & selector) const override;
/// Finalizes all subcolumns.
void finalize() override;
bool isFinalized() const override;
/// Order of rows in ColumnObject is undefined.
void getPermutation(PermutationSortDirection, PermutationSortStability, size_t, int, Permutation & res) const override;
void getPermutation(PermutationSortDirection, PermutationSortStability, size_t, int, Permutation &) const override;
void updatePermutation(PermutationSortDirection, PermutationSortStability, size_t, int, Permutation &, EqualRanges &) const override {}
/// Values of ColumnObject are not comparable.
#if !defined(DEBUG_OR_SANITIZER_BUILD)
int compareAt(size_t, size_t, const IColumn &, int) const override { return 0; }
#else
@ -243,35 +151,118 @@ public:
#endif
void getExtremes(Field & min, Field & max) const override;
/// All other methods throw exception.
void reserve(size_t n) override;
size_t capacity() const override;
void prepareForSquashing(const std::vector<ColumnPtr> & source_columns) override;
void ensureOwnership() override;
size_t byteSize() const override;
size_t byteSizeAt(size_t n) const override;
size_t allocatedBytes() const override;
void protect() override;
StringRef getDataAt(size_t) const override { throwMustBeConcrete(); }
bool isDefaultAt(size_t) const override { throwMustBeConcrete(); }
void insertData(const char *, size_t) override { throwMustBeConcrete(); }
StringRef serializeValueIntoArena(size_t, Arena &, char const *&) const override { throwMustBeConcrete(); }
char * serializeValueIntoMemory(size_t, char *) const override { throwMustBeConcrete(); }
const char * deserializeAndInsertFromArena(const char *) override { throwMustBeConcrete(); }
const char * skipSerializedInArena(const char *) const override { throwMustBeConcrete(); }
void updateHashWithValue(size_t, SipHash &) const override { throwMustBeConcrete(); }
WeakHash32 getWeakHash32() const override { throwMustBeConcrete(); }
void updateHashFast(SipHash & hash) const override;
void expand(const Filter &, bool) override { throwMustBeConcrete(); }
bool hasEqualValues() const override { throwMustBeConcrete(); }
size_t byteSizeAt(size_t) const override { throwMustBeConcrete(); }
double getRatioOfDefaultRows(double) const override { throwMustBeConcrete(); }
UInt64 getNumberOfDefaultRows() const override { throwMustBeConcrete(); }
void getIndicesOfNonDefaultRows(Offsets &, size_t, size_t) const override { throwMustBeConcrete(); }
void forEachSubcolumn(MutableColumnCallback callback) override;
private:
[[noreturn]] static void throwMustBeConcrete()
void forEachSubcolumnRecursively(RecursiveMutableColumnCallback callback) override;
bool structureEquals(const IColumn & rhs) const override;
ColumnPtr compress() const override;
void finalize() override;
bool isFinalized() const override;
bool hasDynamicStructure() const override { return true; }
void takeDynamicStructureFromSourceColumns(const Columns & source_columns) override;
const PathToColumnMap & getTypedPaths() const { return typed_paths; }
PathToColumnMap & getTypedPaths() { return typed_paths; }
const PathToColumnMap & getDynamicPaths() const { return dynamic_paths; }
PathToColumnMap & getDynamicPaths() { return dynamic_paths; }
const PathToDynamicColumnPtrMap & getDynamicPathsPtrs() const { return dynamic_paths_ptrs; }
PathToDynamicColumnPtrMap & getDynamicPathsPtrs() { return dynamic_paths_ptrs; }
const StatisticsPtr & getStatistics() const { return statistics; }
const ColumnPtr & getSharedDataPtr() const { return shared_data; }
ColumnPtr & getSharedDataPtr() { return shared_data; }
IColumn & getSharedDataColumn() { return *shared_data; }
const ColumnArray & getSharedDataNestedColumn() const { return assert_cast<const ColumnArray &>(*shared_data); }
ColumnArray & getSharedDataNestedColumn() { return assert_cast<ColumnArray &>(*shared_data); }
ColumnArray::Offsets & getSharedDataOffsets() { return assert_cast<ColumnArray &>(*shared_data).getOffsets(); }
const ColumnArray::Offsets & getSharedDataOffsets() const { return assert_cast<const ColumnArray &>(*shared_data).getOffsets(); }
std::pair<ColumnString *, ColumnString *> getSharedDataPathsAndValues()
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "ColumnObject must be converted to ColumnTuple before use");
auto & column_array = assert_cast<ColumnArray &>(*shared_data);
auto & column_tuple = assert_cast<ColumnTuple &>(column_array.getData());
return {assert_cast<ColumnString *>(&column_tuple.getColumn(0)), assert_cast<ColumnString *>(&column_tuple.getColumn(1))};
}
template <typename Func>
MutableColumnPtr applyForSubcolumns(Func && func) const;
std::pair<const ColumnString *, const ColumnString *> getSharedDataPathsAndValues() const
{
const auto & column_array = assert_cast<const ColumnArray &>(*shared_data);
const auto & column_tuple = assert_cast<const ColumnTuple &>(column_array.getData());
return {assert_cast<const ColumnString *>(&column_tuple.getColumn(0)), assert_cast<const ColumnString *>(&column_tuple.getColumn(1))};
}
/// It's used to get shared sized of Nested to insert correct default values.
const Subcolumns::Node * getLeafOfTheSameNested(const Subcolumns::NodePtr & entry) const;
size_t getMaxDynamicTypes() const { return max_dynamic_types; }
size_t getMaxDynamicPaths() const { return max_dynamic_paths; }
size_t getGlobalMaxDynamicPaths() const { return global_max_dynamic_paths; }
/// Try to add new dynamic path. Returns pointer to the new dynamic
/// path column or nullptr if limit on dynamic paths is reached.
ColumnDynamic * tryToAddNewDynamicPath(std::string_view path);
/// Throws an exception if cannot add.
void addNewDynamicPath(std::string_view path);
void setDynamicPaths(const std::vector<String> & paths);
void setMaxDynamicPaths(size_t max_dynamic_paths_);
void setStatistics(const StatisticsPtr & statistics_) { statistics = statistics_; }
void serializePathAndValueIntoSharedData(ColumnString * shared_data_paths, ColumnString * shared_data_values, std::string_view path, const IColumn & column, size_t n);
void deserializeValueFromSharedData(const ColumnString * shared_data_values, size_t n, IColumn & column) const;
/// Paths in shared data are sorted in each row. Use this method to find the lower bound for specific path in the row.
static size_t findPathLowerBoundInSharedData(StringRef path, const ColumnString & shared_data_paths, size_t start, size_t end);
/// Insert all the data from shared data with specified path to dynamic column.
static void fillPathColumnFromSharedData(IColumn & path_column, StringRef path, const ColumnPtr & shared_data_column, size_t start, size_t end);
private:
void insertFromSharedDataAndFillRemainingDynamicPaths(const ColumnObject & src_object_column, std::vector<std::string_view> && src_dynamic_paths_for_shared_data, size_t start, size_t length);
void serializePathAndValueIntoArena(Arena & arena, const char *& begin, StringRef path, StringRef value, StringRef & res) const;
/// Map path -> column for paths with explicitly specified types.
/// This set of paths is constant and cannot be changed.
PathToColumnMap typed_paths;
/// Map path -> column for dynamically added paths. All columns
/// here are Dynamic columns. This set of paths can be extended
/// during inerts into the column.
PathToColumnMap dynamic_paths;
/// Store and use pointers to ColumnDynamic to avoid virtual calls.
/// With hundreds of dynamic paths these virtual calls are noticeable.
PathToDynamicColumnPtrMap dynamic_paths_ptrs;
/// Shared storage for all other paths and values. It's filled
/// when the number of dynamic paths reaches the limit.
/// It has type Array(Tuple(String, String)) and stores
/// an array of pairs (path, binary serialized dynamic value) for each row.
WrappedPtr shared_data;
/// Maximum number of dynamic paths. If this limit is reached, all new paths will be inserted into shared data.
/// This limit can be different for different instances of Object column. For example, we can decrease it
/// in takeDynamicStructureFromSourceColumns before merge.
size_t max_dynamic_paths;
/// Global limit on number of dynamic paths for all column instances of this Object type. It's the limit specified
/// in the type definition (for example 'JSON(max_dynamic_paths=N)'). max_dynamic_paths is always not greater than this limit.
size_t global_max_dynamic_paths;
/// Maximum number of dynamic types for each dynamic path. Used while creating Dynamic columns for new dynamic paths.
size_t max_dynamic_types;
/// Statistics on the number of non-null values for each dynamic path and for some shared data paths in the MergeTree data part.
/// Calculated during serializing of data part in MergeTree. Used to determine the set of dynamic paths for the merged part.
StatisticsPtr statistics;
};
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,275 @@
#pragma once
#include <Columns/IColumn.h>
#include <Core/Field.h>
#include <Core/Names.h>
#include <DataTypes/Serializations/SubcolumnsTree.h>
#include <Common/PODArray.h>
#include <Common/WeakHash.h>
#include <DataTypes/IDataType.h>
namespace DB
{
/// Error codes referenced by this header (thrown by the
/// not-implemented stubs of ColumnObjectDeprecated below).
namespace ErrorCodes
{
    extern const int NOT_IMPLEMENTED;
}
/// Info that represents a scalar or array field in a decomposed view.
/// It allows to recreate the field with a different number
/// of dimensions or nullability.
struct FieldInfo
{
    /// The common type of all scalars in field.
    DataTypePtr scalar_type;

    /// Do we have NULL scalar in field.
    bool have_nulls;

    /// If true then we have scalars with different types in array and
    /// we need to convert scalars to the common type.
    bool need_convert;

    /// Number of dimensions in array. 0 if field is scalar.
    size_t num_dimensions;

    /// If true then this field is an array of variadic dimension field
    /// and we need to normalize (fold) the dimensions.
    bool need_fold_dimension;
};

/// Decomposes @field into a FieldInfo: common scalar type, nullability,
/// number of array dimensions and whether conversion/folding is needed.
FieldInfo getFieldInfo(const Field & field);
/** A column that represents object with dynamic set of subcolumns.
 *  Subcolumns are identified by paths in document and are stored in
 *  a trie-like structure. ColumnObjectDeprecated is not suitable for writing into tables
 *  and it should be converted to Tuple with fixed set of subcolumns before that.
 */
class ColumnObjectDeprecated final : public COWHelper<IColumnHelper<ColumnObjectDeprecated>, ColumnObjectDeprecated>
{
public:
    /** Class that represents one subcolumn.
      * It stores values in several parts of column
      * and keeps current common type of all parts.
      * We add a new column part with a new type, when we insert a field,
      * which can't be converted to the current common type.
      * After insertion of all values subcolumn should be finalized
      * for writing and other operations.
      */
    class Subcolumn
    {
    public:
        Subcolumn() = default;
        Subcolumn(size_t size_, bool is_nullable_);
        Subcolumn(MutableColumnPtr && data_, bool is_nullable_);

        size_t size() const;
        size_t byteSize() const;
        size_t allocatedBytes() const;
        void get(size_t n, Field & res) const;

        bool isFinalized() const;
        const DataTypePtr & getLeastCommonType() const { return least_common_type.get(); }
        const DataTypePtr & getLeastCommonTypeBase() const { return least_common_type.getBase(); }
        size_t getNumberOfDimensions() const { return least_common_type.getNumberOfDimensions(); }

        /// Checks the consistency of column's parts stored in @data.
        void checkTypes() const;

        /// Inserts a field, which scalars can be arbitrary, but number of
        /// dimensions should be consistent with current common type.
        /// The second overload takes a precomputed FieldInfo to avoid recomputation.
        void insert(Field field);
        void insert(Field field, FieldInfo info);

        void insertDefault();
        void insertManyDefaults(size_t length);
        void insertRangeFrom(const Subcolumn & src, size_t start, size_t length);
        void popBack(size_t n);

        /// Returns a copy of the rows [start, start + length).
        Subcolumn cut(size_t start, size_t length) const;

        /// Converts all column's parts to the common type and
        /// creates a single column that stores all values.
        void finalize();

        /// Returns last inserted field.
        Field getLastField() const;

        FieldInfo getFieldInfo() const;

        /// Recreates subcolumn with default scalar values and keeps sizes of arrays.
        /// Used to create columns of type Nested with consistent array sizes.
        Subcolumn recreateWithDefaultValues(const FieldInfo & field_info) const;

        /// Returns single column if subcolumn is finalized.
        /// Otherwise -- undefined behaviour.
        IColumn & getFinalizedColumn();
        const IColumn & getFinalizedColumn() const;
        const ColumnPtr & getFinalizedColumnPtr() const;

        const std::vector<WrappedPtr> & getData() const { return data; }
        size_t getNumberOfDefaultsInPrefix() const { return num_of_defaults_in_prefix; }

        friend class ColumnObjectDeprecated;

    private:
        /// Tracks the current least common type of the subcolumn together
        /// with its base (scalar) type and number of array dimensions.
        class LeastCommonType
        {
        public:
            LeastCommonType();
            explicit LeastCommonType(DataTypePtr type_);

            const DataTypePtr & get() const { return type; }
            const DataTypePtr & getBase() const { return base_type; }
            size_t getNumberOfDimensions() const { return num_dimensions; }

        private:
            DataTypePtr type;
            DataTypePtr base_type;
            size_t num_dimensions = 0;
        };

        void addNewColumnPart(DataTypePtr type);

        /// Current least common type of all values inserted to this subcolumn.
        LeastCommonType least_common_type;

        /// If true then common type of subcolumn is Nullable
        /// and default values are NULLs.
        bool is_nullable = false;

        /// Parts of column. Parts should be in increasing order in terms of subtypes/supertypes.
        /// That means that the least common type for i-th prefix is the type of i-th part
        /// and it's the supertype for all types of column parts from 0 to i-1.
        std::vector<WrappedPtr> data;

        /// Until we insert any non-default field we don't know further
        /// least common type and we count number of defaults in prefix,
        /// which will be converted to the default type of final common type.
        size_t num_of_defaults_in_prefix = 0;

        size_t num_rows = 0;
    };

    using Subcolumns = SubcolumnsTree<Subcolumn>;

private:
    /// If true then all subcolumns are nullable.
    const bool is_nullable;

    Subcolumns subcolumns;
    size_t num_rows;

public:
    static constexpr auto COLUMN_NAME_DUMMY = "_dummy";

    explicit ColumnObjectDeprecated(bool is_nullable_);
    ColumnObjectDeprecated(Subcolumns && subcolumns_, bool is_nullable_);

    /// Checks that all subcolumns have consistent sizes.
    void checkConsistency() const;

    bool hasSubcolumn(const PathInData & key) const;

    const Subcolumn & getSubcolumn(const PathInData & key) const;
    Subcolumn & getSubcolumn(const PathInData & key);

    void incrementNumRows() { ++num_rows; }

    /// Adds a subcolumn from existing IColumn.
    void addSubcolumn(const PathInData & key, MutableColumnPtr && subcolumn);

    /// Adds a subcolumn of specific size with default values.
    void addSubcolumn(const PathInData & key, size_t new_size);

    /// Adds a subcolumn of type Nested of specific size with default values.
    /// It cares about consistency of sizes of Nested arrays.
    void addNestedSubcolumn(const PathInData & key, const FieldInfo & field_info, size_t new_size);

    /// Finds a subcolumn from the same Nested type as @entry and inserts
    /// an array with default values with consistent sizes as in Nested type.
    bool tryInsertDefaultFromNested(const Subcolumns::NodePtr & entry) const;
    bool tryInsertManyDefaultsFromNested(const Subcolumns::NodePtr & entry) const;

    const Subcolumns & getSubcolumns() const { return subcolumns; }
    Subcolumns & getSubcolumns() { return subcolumns; }
    PathsInData getKeys() const;

    /// Part of interface
    const char * getFamilyName() const override { return "Object"; }
    TypeIndex getDataType() const override { return TypeIndex::ObjectDeprecated; }

    size_t size() const override;
    size_t byteSize() const override;
    size_t allocatedBytes() const override;
    void forEachSubcolumn(MutableColumnCallback callback) override;
    void forEachSubcolumnRecursively(RecursiveMutableColumnCallback callback) override;
    void insert(const Field & field) override;
    bool tryInsert(const Field & field) override;
    void insertDefault() override;
#if !defined(DEBUG_OR_SANITIZER_BUILD)
    void insertFrom(const IColumn & src, size_t n) override;
    void insertRangeFrom(const IColumn & src, size_t start, size_t length) override;
#else
    void doInsertFrom(const IColumn & src, size_t n) override;
    void doInsertRangeFrom(const IColumn & src, size_t start, size_t length) override;
#endif
    void popBack(size_t length) override;
    Field operator[](size_t n) const override;
    void get(size_t n, Field & res) const override;

    ColumnPtr permute(const Permutation & perm, size_t limit) const override;
    ColumnPtr filter(const Filter & filter, ssize_t result_size_hint) const override;
    ColumnPtr index(const IColumn & indexes, size_t limit) const override;
    ColumnPtr replicate(const Offsets & offsets) const override;
    MutableColumnPtr cloneResized(size_t new_size) const override;

    /// Finalizes all subcolumns.
    void finalize() override;
    bool isFinalized() const override;

    /// Order of rows in ColumnObjectDeprecated is undefined.
    void getPermutation(PermutationSortDirection, PermutationSortStability, size_t, int, Permutation & res) const override;
    void updatePermutation(PermutationSortDirection, PermutationSortStability, size_t, int, Permutation &, EqualRanges &) const override {}
#if !defined(DEBUG_OR_SANITIZER_BUILD)
    int compareAt(size_t, size_t, const IColumn &, int) const override { return 0; }
#else
    int doCompareAt(size_t, size_t, const IColumn &, int) const override { return 0; }
#endif
    void getExtremes(Field & min, Field & max) const override;

    /// All other methods throw exception.
    StringRef getDataAt(size_t) const override { throwMustBeConcrete(); }
    bool isDefaultAt(size_t) const override { throwMustBeConcrete(); }
    void insertData(const char *, size_t) override { throwMustBeConcrete(); }
    StringRef serializeValueIntoArena(size_t, Arena &, char const *&) const override { throwMustBeConcrete(); }
    char * serializeValueIntoMemory(size_t, char *) const override { throwMustBeConcrete(); }
    const char * deserializeAndInsertFromArena(const char *) override { throwMustBeConcrete(); }
    const char * skipSerializedInArena(const char *) const override { throwMustBeConcrete(); }
    void updateHashWithValue(size_t, SipHash &) const override { throwMustBeConcrete(); }
    WeakHash32 getWeakHash32() const override { throwMustBeConcrete(); }
    void updateHashFast(SipHash &) const override;
    void expand(const Filter &, bool) override { throwMustBeConcrete(); }
    bool hasEqualValues() const override { throwMustBeConcrete(); }
    size_t byteSizeAt(size_t) const override { throwMustBeConcrete(); }
    double getRatioOfDefaultRows(double) const override { throwMustBeConcrete(); }
    UInt64 getNumberOfDefaultRows() const override { throwMustBeConcrete(); }
    void getIndicesOfNonDefaultRows(Offsets &, size_t, size_t) const override { throwMustBeConcrete(); }

private:
    [[noreturn]] static void throwMustBeConcrete()
    {
        throw Exception(ErrorCodes::NOT_IMPLEMENTED, "ColumnObjectDeprecated must be converted to ColumnTuple before use");
    }

    template <typename Func>
    MutableColumnPtr applyForSubcolumns(Func && func) const;

    /// It's used to get shared sizes of Nested to insert correct default values.
    const Subcolumns::Node * getLeafOfTheSameNested(const Subcolumns::NodePtr & entry) const;
};
}

View File

@ -557,6 +557,11 @@ void ColumnString::reserve(size_t n)
offsets.reserve_exact(n);
}
/// Number of rows that can be stored without reallocating the offsets array.
/// Note: only the per-row offsets are consulted here, not the chars buffer.
size_t ColumnString::capacity() const
{
    return offsets.capacity();
}
void ColumnString::prepareForSquashing(const Columns & source_columns)
{
size_t new_size = size();

View File

@ -283,6 +283,7 @@ public:
ColumnPtr compress() const override;
void reserve(size_t n) override;
size_t capacity() const override;
void prepareForSquashing(const Columns & source_columns) override;
void shrinkToFit() override;

View File

@ -595,6 +595,14 @@ void ColumnTuple::reserve(size_t n)
getColumn(i).reserve(n);
}
/// Reserved number of rows, taken from the first element column.
/// A tuple without elements has no element columns to ask, so fall
/// back to the current row count.
size_t ColumnTuple::capacity() const
{
    return columns.empty() ? size() : getColumn(0).capacity();
}
void ColumnTuple::prepareForSquashing(const Columns & source_columns)
{
const size_t tuple_size = columns.size();

View File

@ -110,6 +110,7 @@ public:
void updatePermutationWithCollation(const Collator & collator, IColumn::PermutationSortDirection direction, IColumn::PermutationSortStability stability,
size_t limit, int nan_direction_hint, IColumn::Permutation & res, EqualRanges& equal_ranges) const override;
void reserve(size_t n) override;
size_t capacity() const override;
void prepareForSquashing(const Columns & source_columns) override;
void shrinkToFit() override;
void ensureOwnership() override;

View File

@ -1277,6 +1277,11 @@ void ColumnVariant::prepareForSquashing(const Columns & source_columns)
}
}
/// Number of rows that can be stored without reallocation, taken from the
/// per-row local discriminators column.
size_t ColumnVariant::capacity() const
{
    return local_discriminators->capacity();
}
void ColumnVariant::ensureOwnership()
{
const size_t num_variants = variants.size();

View File

@ -241,6 +241,7 @@ public:
size_t limit, int nan_direction_hint, IColumn::Permutation & res, EqualRanges & equal_ranges) const override;
void reserve(size_t n) override;
size_t capacity() const override;
void prepareForSquashing(const Columns & source_columns) override;
void ensureOwnership() override;
size_t byteSize() const override;

View File

@ -180,6 +180,11 @@ public:
data.reserve_exact(n);
}
/// Number of elements that can be stored without reallocation of @data.
size_t capacity() const override
{
    return data.capacity();
}
void shrinkToFit() override
{
data.shrink_to_fit();

View File

@ -11,12 +11,13 @@
#include <Columns/ColumnLowCardinality.h>
#include <Columns/ColumnMap.h>
#include <Columns/ColumnNullable.h>
#include <Columns/ColumnObject.h>
#include <Columns/ColumnObjectDeprecated.h>
#include <Columns/ColumnSparse.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnTuple.h>
#include <Columns/ColumnVariant.h>
#include <Columns/ColumnDynamic.h>
#include <Columns/ColumnObject.h>
#include <Columns/ColumnVector.h>
#include <Core/Field.h>
#include <DataTypes/Serializations/SerializationInfo.h>
@ -466,12 +467,13 @@ template class IColumnHelper<ColumnArray, IColumn>;
template class IColumnHelper<ColumnTuple, IColumn>;
template class IColumnHelper<ColumnMap, IColumn>;
template class IColumnHelper<ColumnSparse, IColumn>;
template class IColumnHelper<ColumnObject, IColumn>;
template class IColumnHelper<ColumnObjectDeprecated, IColumn>;
template class IColumnHelper<ColumnAggregateFunction, IColumn>;
template class IColumnHelper<ColumnFunction, IColumn>;
template class IColumnHelper<ColumnCompressed, IColumn>;
template class IColumnHelper<ColumnVariant, IColumn>;
template class IColumnHelper<ColumnDynamic, IColumn>;
template class IColumnHelper<ColumnObject, IColumn>;
template class IColumnHelper<IColumnDummy, IColumn>;

View File

@ -475,6 +475,9 @@ public:
/// It affects performance only (not correctness).
virtual void reserve(size_t /*n*/) {}
/// Returns the number of elements allocated in reserve.
virtual size_t capacity() const { return size(); }
/// Reserve memory before squashing all specified source columns into this column.
virtual void prepareForSquashing(const std::vector<Ptr> & source_columns)
{

View File

@ -0,0 +1,351 @@
#include <Columns/ColumnString.h>
#include <Columns/ColumnObject.h>
#include <DataTypes/DataTypeFactory.h>
#include <IO/ReadBufferFromMemory.h>
#include <IO/WriteBufferFromString.h>
#include <Common/Arena.h>
#include <gtest/gtest.h>
using namespace DB;
/// A freshly created JSON column must contain its statically typed paths
/// (with the declared types), no dynamic paths, empty shared data, and the
/// limits taken from the type parameters.
TEST(ColumnObject, CreateEmpty)
{
    auto type = DataTypeFactory::instance().get("JSON(max_dynamic_types=10, max_dynamic_paths=20, a.b UInt32, a.c Array(String))");
    auto col = type->createColumn();
    const auto & col_object = assert_cast<const ColumnObject &>(*col);
    /// Typed paths are created eagerly with their declared column types.
    const auto & typed_paths = col_object.getTypedPaths();
    ASSERT_TRUE(typed_paths.contains("a.b"));
    ASSERT_EQ(typed_paths.at("a.b")->getName(), "UInt32");
    ASSERT_TRUE(typed_paths.contains("a.c"));
    ASSERT_EQ(typed_paths.at("a.c")->getName(), "Array(String)");
    /// Nothing inserted yet -> no dynamic paths and empty shared data.
    ASSERT_TRUE(col_object.getDynamicPaths().empty());
    ASSERT_TRUE(col_object.getSharedDataOffsets().empty());
    ASSERT_TRUE(col_object.getSharedDataPathsAndValues().first->empty());
    ASSERT_TRUE(col_object.getSharedDataPathsAndValues().second->empty());
    /// Limits are propagated from the type definition.
    ASSERT_EQ(col_object.getMaxDynamicTypes(), 10);
    ASSERT_EQ(col_object.getMaxDynamicPaths(), 20);
}
/// getName() must print the limits and the typed paths; note that the paths
/// come out in sorted order ("a.b" before "b.d") regardless of declaration order.
TEST(ColumnObject, GetName)
{
    auto type = DataTypeFactory::instance().get("JSON(max_dynamic_types=10, max_dynamic_paths=20, b.d UInt32, a.b Array(String))");
    auto col = type->createColumn();
    ASSERT_EQ(col->getName(), "Object(max_dynamic_paths=20, max_dynamic_types=10, a.b Array(String), b.d UInt32)");
}
/// Reads the n-th value from the shared-data "values" column and decodes it
/// back into a Field via the Dynamic binary serialization (the format the
/// ColumnObject shared data stores values in).
/// Marked static: this is a file-local test helper and should have internal
/// linkage to avoid ODR clashes with other test translation units.
static Field deserializeFieldFromSharedData(ColumnString * values, size_t n)
{
    auto data = values->getDataAt(n);
    ReadBufferFromMemory buf(data.data, data.size);
    Field res;
    std::make_shared<SerializationDynamic>()->deserializeBinary(res, buf, FormatSettings());
    return res;
}
/// Inserting Object fields must route each path to the right storage:
/// typed paths -> typed columns, new paths -> dynamic columns (up to
/// max_dynamic_paths=2), and once the limit is reached all further new
/// paths go into shared data (kept sorted by path within each row).
TEST(ColumnObject, InsertField)
{
    auto type = DataTypeFactory::instance().get("JSON(max_dynamic_types=10, max_dynamic_paths=2, b.d UInt32, a.b Array(String))");
    auto col = type->createColumn();
    auto & col_object = assert_cast<ColumnObject &>(*col);
    const auto & typed_paths = col_object.getTypedPaths();
    const auto & dynamic_paths = col_object.getDynamicPaths();
    const auto & shared_data_nested_column = col_object.getSharedDataNestedColumn();
    const auto & shared_data_offsets = col_object.getSharedDataOffsets();
    const auto [shared_data_paths, shared_data_values] = col_object.getSharedDataPathsAndValues();

    /// Row 0: empty object -> typed paths get default values, nothing else is touched.
    Object empty_object;
    col_object.insert(empty_object);
    ASSERT_EQ(col_object[0], (Object{{"a.b", Array{}}, {"b.d", Field(0u)}}));
    ASSERT_EQ(typed_paths.at("a.b")->size(), 1);
    ASSERT_TRUE(typed_paths.at("a.b")->isDefaultAt(0));
    ASSERT_EQ(typed_paths.at("b.d")->size(), 1);
    ASSERT_TRUE(typed_paths.at("b.d")->isDefaultAt(0));
    ASSERT_TRUE(dynamic_paths.empty());
    ASSERT_EQ(shared_data_nested_column.size(), 1);
    ASSERT_TRUE(shared_data_nested_column.isDefaultAt(0));

    /// Row 1: "a.c" is new -> becomes the first dynamic path (backfilled with defaults).
    Object object1 = {{"a.b", Array{String("Hello"), String("World")}}, {"a.c", Field(42)}};
    col_object.insert(object1);
    ASSERT_EQ(col_object[1], (Object{{"a.b", Array{String("Hello"), String("World")}}, {"b.d", Field(0u)}, {"a.c", Field(42)}}));
    ASSERT_EQ(typed_paths.at("a.b")->size(), 2);
    ASSERT_EQ((*typed_paths.at("a.b"))[1], (Array{String("Hello"), String("World")}));
    ASSERT_EQ(typed_paths.at("b.d")->size(), 2);
    ASSERT_TRUE(typed_paths.at("b.d")->isDefaultAt(1));
    ASSERT_EQ(dynamic_paths.size(), 1);
    ASSERT_TRUE(dynamic_paths.contains("a.c"));
    ASSERT_EQ(dynamic_paths.at("a.c")->size(), 2);
    ASSERT_TRUE(dynamic_paths.at("a.c")->isDefaultAt(0));
    ASSERT_EQ((*dynamic_paths.at("a.c"))[1], Field(42));
    ASSERT_EQ(shared_data_nested_column.size(), 2);
    ASSERT_TRUE(shared_data_nested_column.isDefaultAt(1));

    /// Row 2: "a.d" takes the second (last) dynamic slot; "a.e" and "a.f"
    /// overflow into shared data.
    Object object2 = {{"b.d", Field(142u)}, {"a.c", Field(43)}, {"a.d", Field("str")}, {"a.e", Field(242)}, {"a.f", Array{Field(42), Field(43)}}};
    col_object.insert(object2);
    ASSERT_EQ(col_object[2], (Object{{"a.b", Array{}}, {"b.d", Field(142u)}, {"a.c", Field(43)}, {"a.d", Field("str")}, {"a.e", Field(242)}, {"a.f", Array{Field(42), Field(43)}}}));
    ASSERT_EQ(typed_paths.at("a.b")->size(), 3);
    ASSERT_TRUE(typed_paths.at("a.b")->isDefaultAt(2));
    ASSERT_EQ(typed_paths.at("b.d")->size(), 3);
    ASSERT_EQ((*typed_paths.at("b.d"))[2], Field(142u));
    ASSERT_EQ(dynamic_paths.size(), 2);
    ASSERT_TRUE(dynamic_paths.contains("a.c"));
    ASSERT_EQ(dynamic_paths.at("a.c")->size(), 3);
    ASSERT_EQ((*dynamic_paths.at("a.c"))[2], Field(43));
    ASSERT_TRUE(dynamic_paths.contains("a.d"));
    ASSERT_EQ(dynamic_paths.at("a.d")->size(), 3);
    ASSERT_EQ((*dynamic_paths.at("a.d"))[2], Field("str"));
    ASSERT_EQ(shared_data_nested_column.size(), 3);
    ASSERT_EQ(shared_data_offsets[2] - shared_data_offsets[1], 2);
    ASSERT_EQ((*shared_data_paths)[0], "a.e");
    ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 0), Field(242));
    ASSERT_EQ((*shared_data_paths)[1], "a.f");
    ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 1), (Array({Field(42), Field(43)})));

    /// Row 3: dynamic-path limit is reached -> all three new "b.*" paths land
    /// in shared data, sorted by path name.
    Object object3 = {{"b.a", Field("Str")}, {"b.b", Field(2)}, {"b.c", Field(Tuple{Field(42), Field("Str")})}};
    col_object.insert(object3);
    ASSERT_EQ(col_object[3], (Object{{"a.b", Array{}}, {"b.d", Field(0u)}, {"b.a", Field("Str")}, {"b.b", Field(2)}, {"b.c", Field(Tuple{Field(42), Field("Str")})}}));
    ASSERT_EQ(typed_paths.at("a.b")->size(), 4);
    ASSERT_TRUE(typed_paths.at("a.b")->isDefaultAt(3));
    ASSERT_EQ(typed_paths.at("b.d")->size(), 4);
    ASSERT_TRUE(typed_paths.at("b.d")->isDefaultAt(3));
    ASSERT_EQ(dynamic_paths.size(), 2);
    ASSERT_EQ(dynamic_paths.at("a.c")->size(), 4);
    ASSERT_TRUE(dynamic_paths.at("a.c")->isDefaultAt(3));
    ASSERT_EQ(dynamic_paths.at("a.d")->size(), 4);
    ASSERT_TRUE(dynamic_paths.at("a.d")->isDefaultAt(3));
    ASSERT_EQ(shared_data_nested_column.size(), 4);
    ASSERT_EQ(shared_data_offsets[3] - shared_data_offsets[2], 3);
    ASSERT_EQ((*shared_data_paths)[2], "b.a");
    ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 2), Field("Str"));
    ASSERT_EQ((*shared_data_paths)[3], "b.b");
    ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 3), Field(2));
    ASSERT_EQ((*shared_data_paths)[4], "b.c");
    ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 4), Field(Tuple{Field(42), Field("Str")}));

    /// Row 4: explicit NULL values are not stored in shared data at all.
    Object object4 = {{"c.c", Field(Null())}, {"c.d", Field(Null())}};
    col_object.insert(object4);
    ASSERT_TRUE(shared_data_nested_column.isDefaultAt(4));
}
/// insertFrom() must copy a single row from another ColumnObject of the same
/// type, mapping the source's paths onto the destination's own dynamic slots
/// and spilling the rest into shared data (sorted by path).
TEST(ColumnObject, InsertFrom)
{
    auto type = DataTypeFactory::instance().get("JSON(max_dynamic_types=10, max_dynamic_paths=2, b.d UInt32, a.b Array(String))");
    auto col = type->createColumn();
    auto & col_object = assert_cast<ColumnObject &>(*col);
    /// Pre-populate one dynamic path ("a.a") so only one free slot remains.
    col_object.insert(Object{{"a.a", Field(42)}});
    const auto & typed_paths = col_object.getTypedPaths();
    const auto & dynamic_paths = col_object.getDynamicPaths();
    const auto & shared_data_nested_column = col_object.getSharedDataNestedColumn();
    const auto & shared_data_offsets = col_object.getSharedDataOffsets();
    const auto [shared_data_paths, shared_data_values] = col_object.getSharedDataPathsAndValues();

    /// Source row with "a.c": takes the destination's last dynamic slot.
    auto src_col1 = type->createColumn();
    auto & src_col_object1 = assert_cast<ColumnObject &>(*src_col1);
    src_col_object1.insert(Object{{"b.d", Field(43u)}, {"a.c", Field("Str1")}});
    col_object.insertFrom(src_col_object1, 0);
    ASSERT_EQ((*typed_paths.at("a.b"))[1], Field(Array{}));
    ASSERT_EQ((*typed_paths.at("b.d"))[1], Field(43u));
    ASSERT_EQ(dynamic_paths.size(), 2);
    ASSERT_EQ((*dynamic_paths.at("a.a"))[1], Field(Null()));
    ASSERT_EQ((*dynamic_paths.at("a.c"))[1], Field("Str1"));
    ASSERT_TRUE(shared_data_nested_column.isDefaultAt(1));

    /// Source row with "a.d"/"a.e": no dynamic slots left -> shared data.
    auto src_col2 = type->createColumn();
    auto & src_col_object2 = assert_cast<ColumnObject &>(*src_col2);
    src_col_object2.insert(Object{{"a.b", Array{"Str4", "Str5"}}, {"b.d", Field(44u)}, {"a.d", Field("Str2")}, {"a.e", Field("Str3")}});
    col_object.insertFrom(src_col_object2, 0);
    ASSERT_EQ((*typed_paths.at("a.b"))[2], Field(Array{"Str4", "Str5"}));
    ASSERT_EQ((*typed_paths.at("b.d"))[2], Field(44u));
    ASSERT_EQ(dynamic_paths.size(), 2);
    ASSERT_EQ((*dynamic_paths.at("a.a"))[2], Field(Null()));
    ASSERT_EQ((*dynamic_paths.at("a.c"))[2], Field(Null()));
    ASSERT_EQ(shared_data_offsets[2] - shared_data_offsets[1], 2);
    ASSERT_EQ((*shared_data_paths)[0], "a.d");
    ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 0), Field("Str2"));
    ASSERT_EQ((*shared_data_paths)[1], "a.e");
    ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 1), Field("Str3"));

    /// Source row 1 mixes paths known to the destination ("a.a", "a.c"),
    /// unknown paths and an explicit NULL ("a.u", which must be dropped).
    auto src_col3 = type->createColumn();
    auto & src_col_object3 = assert_cast<ColumnObject &>(*src_col3);
    src_col_object3.insert(Object{{"a.h", Field("Str6")}, {"h.h", Field("Str7")}});
    src_col_object3.insert(Object{{"a.a", Field("Str10")}, {"a.c", Field(45u)}, {"a.h", Field("Str6")}, {"h.h", Field("Str7")}, {"a.f", Field("Str8")}, {"a.g", Field("Str9")}, {"a.i", Field("Str11")}, {"a.u", Field(Null())}});
    col_object.insertFrom(src_col_object3, 1);
    ASSERT_EQ((*typed_paths.at("a.b"))[3], Field(Array{}));
    ASSERT_EQ((*typed_paths.at("b.d"))[3], Field(0u));
    ASSERT_EQ(dynamic_paths.size(), 2);
    ASSERT_EQ((*dynamic_paths.at("a.a"))[3], Field("Str10"));
    ASSERT_EQ((*dynamic_paths.at("a.c"))[3], Field(45u));
    ASSERT_EQ(shared_data_offsets[3] - shared_data_offsets[2], 5);
    ASSERT_EQ((*shared_data_paths)[2], "a.f");
    ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 2), Field("Str8"));
    ASSERT_EQ((*shared_data_paths)[3], "a.g");
    ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 3), Field("Str9"));
    ASSERT_EQ((*shared_data_paths)[4], "a.h");
    ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 4), Field("Str6"));
    ASSERT_EQ((*shared_data_paths)[5], "a.i");
    ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 5), Field("Str11"));
    ASSERT_EQ((*shared_data_paths)[6], "h.h");
    ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 6), Field("Str7"));
}
/// insertRangeFrom() must copy a contiguous range of rows from another
/// ColumnObject, with the same path routing as insertFrom(): destination's
/// dynamic slots first, then shared data (sorted by path within each row).
TEST(ColumnObject, InsertRangeFrom)
{
    auto type = DataTypeFactory::instance().get("JSON(max_dynamic_types=10, max_dynamic_paths=2, b.d UInt32, a.b Array(String))");
    auto col = type->createColumn();
    auto & col_object = assert_cast<ColumnObject &>(*col);
    /// Pre-populate one dynamic path ("a.a") so only one free slot remains.
    col_object.insert(Object{{"a.a", Field(42)}});
    const auto & typed_paths = col_object.getTypedPaths();
    const auto & dynamic_paths = col_object.getDynamicPaths();
    const auto & shared_data_nested_column = col_object.getSharedDataNestedColumn();
    const auto & shared_data_offsets = col_object.getSharedDataOffsets();
    const auto [shared_data_paths, shared_data_values] = col_object.getSharedDataPathsAndValues();

    /// Range 1 (rows 0-2): "a.c" fits into the remaining dynamic slot.
    auto src_col1 = type->createColumn();
    auto & src_col_object1 = assert_cast<ColumnObject &>(*src_col1);
    src_col_object1.insert(Object{{"b.d", Field(43u)}, {"a.c", Field("Str1")}});
    src_col_object1.insert(Object{{"a.b", Field(Array{"Str1", "Str2"})}, {"a.a", Field("Str1")}});
    src_col_object1.insert(Object{{"b.d", Field(45u)}, {"a.c", Field("Str2")}});
    col_object.insertRangeFrom(src_col_object1, 0, 3);
    ASSERT_EQ((*typed_paths.at("a.b"))[1], Field(Array{}));
    ASSERT_EQ((*typed_paths.at("a.b"))[2], Field(Array{"Str1", "Str2"}));
    ASSERT_EQ((*typed_paths.at("a.b"))[3], Field(Array{}));
    ASSERT_EQ((*typed_paths.at("b.d"))[1], Field(43u));
    ASSERT_EQ((*typed_paths.at("b.d"))[2], Field(0u));
    ASSERT_EQ((*typed_paths.at("b.d"))[3], Field(45u));
    ASSERT_EQ(dynamic_paths.size(), 2);
    ASSERT_EQ((*dynamic_paths.at("a.a"))[1], Field(Null()));
    ASSERT_EQ((*dynamic_paths.at("a.a"))[2], Field("Str1"));
    ASSERT_EQ((*dynamic_paths.at("a.a"))[3], Field(Null()));
    ASSERT_EQ((*dynamic_paths.at("a.c"))[1], Field("Str1"));
    ASSERT_EQ((*dynamic_paths.at("a.c"))[2], Field(Null()));
    ASSERT_EQ((*dynamic_paths.at("a.c"))[3], Field("Str2"));
    ASSERT_TRUE(shared_data_nested_column.isDefaultAt(1));
    ASSERT_TRUE(shared_data_nested_column.isDefaultAt(2));
    ASSERT_TRUE(shared_data_nested_column.isDefaultAt(3));

    /// Range 2 (rows 0-2): "a.d"/"a.e" do not fit -> shared data per row.
    auto src_col2 = type->createColumn();
    auto & src_col_object2 = assert_cast<ColumnObject &>(*src_col2);
    src_col_object2.insert(Object{{"a.b", Array{"Str4", "Str5"}}, {"a.d", Field("Str2")}, {"a.e", Field("Str3")}});
    src_col_object2.insert(Object{{"b.d", Field(44u)}, {"a.d", Field("Str22")}, {"a.e", Field("Str33")}});
    src_col_object2.insert(Object{{"a.b", Array{"Str44", "Str55"}}, {"a.d", Field("Str222")}, {"a.e", Field("Str333")}});
    col_object.insertRangeFrom(src_col_object2, 0, 3);
    ASSERT_EQ((*typed_paths.at("a.b"))[4], Field(Array{"Str4", "Str5"}));
    ASSERT_EQ((*typed_paths.at("a.b"))[5], Field(Array{}));
    ASSERT_EQ((*typed_paths.at("a.b"))[6], Field(Array{"Str44", "Str55"}));
    ASSERT_EQ((*typed_paths.at("b.d"))[4], Field(0u));
    ASSERT_EQ((*typed_paths.at("b.d"))[5], Field(44u));
    ASSERT_EQ((*typed_paths.at("b.d"))[6], Field(0u));
    ASSERT_EQ(dynamic_paths.size(), 2);
    ASSERT_EQ((*dynamic_paths.at("a.a"))[4], Field(Null()));
    ASSERT_EQ((*dynamic_paths.at("a.a"))[5], Field(Null()));
    ASSERT_EQ((*dynamic_paths.at("a.a"))[6], Field(Null()));
    ASSERT_EQ((*dynamic_paths.at("a.c"))[4], Field(Null()));
    ASSERT_EQ((*dynamic_paths.at("a.c"))[5], Field(Null()));
    ASSERT_EQ((*dynamic_paths.at("a.c"))[6], Field(Null()));
    ASSERT_EQ(shared_data_offsets[4] - shared_data_offsets[3], 2);
    ASSERT_EQ((*shared_data_paths)[0], "a.d");
    ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 0), Field("Str2"));
    ASSERT_EQ((*shared_data_paths)[1], "a.e");
    ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 1), Field("Str3"));
    ASSERT_EQ(shared_data_offsets[5] - shared_data_offsets[4], 2);
    ASSERT_EQ((*shared_data_paths)[2], "a.d");
    ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 2), Field("Str22"));
    ASSERT_EQ((*shared_data_paths)[3], "a.e");
    ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 3), Field("Str33"));
    ASSERT_EQ(shared_data_offsets[6] - shared_data_offsets[5], 2);
    ASSERT_EQ((*shared_data_paths)[4], "a.d");
    ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 4), Field("Str222"));
    ASSERT_EQ((*shared_data_paths)[5], "a.e");
    ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 5), Field("Str333"));

    /// Range 3: copy only rows [1, 4) of a 4-row source; row 2 of the range is
    /// an all-shared-data row, row 3 hits existing dynamic paths again.
    auto src_col3 = type->createColumn();
    auto & src_col_object3 = assert_cast<ColumnObject &>(*src_col3);
    src_col_object3.insert(Object{{"a.h", Field("Str6")}, {"h.h", Field("Str7")}});
    src_col_object3.insert(Object{{"a.h", Field("Str6")}, {"h.h", Field("Str7")}, {"a.f", Field("Str8")}, {"a.g", Field("Str9")}, {"a.i", Field("Str11")}});
    src_col_object3.insert(Object{{"a.a", Field("Str10")}});
    src_col_object3.insert(Object{{"a.h", Field("Str6")}, {"a.c", Field(45u)}, {"h.h", Field("Str7")}, {"a.i", Field("Str11")}});
    col_object.insertRangeFrom(src_col_object3, 1, 3);
    ASSERT_EQ((*typed_paths.at("a.b"))[7], Field(Array{}));
    ASSERT_EQ((*typed_paths.at("a.b"))[8], Field(Array{}));
    ASSERT_EQ((*typed_paths.at("a.b"))[9], Field(Array{}));
    ASSERT_EQ((*typed_paths.at("b.d"))[7], Field(0u));
    ASSERT_EQ((*typed_paths.at("b.d"))[8], Field(0u));
    ASSERT_EQ((*typed_paths.at("b.d"))[9], Field(0u));
    ASSERT_EQ(dynamic_paths.size(), 2);
    ASSERT_EQ((*dynamic_paths.at("a.a"))[7], Field(Null()));
    ASSERT_EQ((*dynamic_paths.at("a.a"))[8], Field("Str10"));
    ASSERT_EQ((*dynamic_paths.at("a.a"))[9], Field(Null()));
    ASSERT_EQ((*dynamic_paths.at("a.c"))[7], Field(Null()));
    ASSERT_EQ((*dynamic_paths.at("a.c"))[8], Field(Null()));
    ASSERT_EQ((*dynamic_paths.at("a.c"))[9], Field(45u));
    ASSERT_EQ(shared_data_offsets[7] - shared_data_offsets[6], 5);
    ASSERT_EQ((*shared_data_paths)[6], "a.f");
    ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 6), Field("Str8"));
    ASSERT_EQ((*shared_data_paths)[7], "a.g");
    ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 7), Field("Str9"));
    ASSERT_EQ((*shared_data_paths)[8], "a.h");
    ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 8), Field("Str6"));
    ASSERT_EQ((*shared_data_paths)[9], "a.i");
    ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 9), Field("Str11"));
    ASSERT_EQ((*shared_data_paths)[10], "h.h");
    ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 10), Field("Str7"));
    ASSERT_EQ(shared_data_offsets[8] - shared_data_offsets[7], 0);
    ASSERT_EQ(shared_data_offsets[9] - shared_data_offsets[8], 3);
    ASSERT_EQ((*shared_data_paths)[11], "a.h");
    ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 11), Field("Str6"));
    ASSERT_EQ((*shared_data_paths)[12], "a.i");
    ASSERT_EQ(deserializeFieldFromSharedData(shared_data_values, 12), Field("Str11"));
}
/// Round-trips rows through Arena serialization: serialize three rows of one
/// column, deserialize them into a *fresh* column, and compare row values.
/// Bug fix: the second assert_cast previously referenced `*col` instead of
/// `*col2`, so the "deserialized" column aliased the source column and the
/// assertions merely re-read the original rows 0-2 — the deserialization path
/// was never actually verified.
TEST(ColumnObject, SerializeDeserializerFromArena)
{
    auto type = DataTypeFactory::instance().get("JSON(max_dynamic_types=10, max_dynamic_paths=2, b.d UInt32, a.b Array(String))");
    auto col = type->createColumn();
    auto & col_object = assert_cast<ColumnObject &>(*col);
    col_object.insert(Object{{"b.d", Field(42u)}, {"a.b", Array{"Str1", "Str2"}}, {"a.a", Tuple{"Str3", 441u}}, {"a.c", Field("Str4")}, {"a.d", Array{Field(45), Field(46)}}, {"a.e", Field(47)}});
    col_object.insert(Object{{"b.a", Field(48)}, {"b.b", Array{Field(49), Field(50)}}});
    col_object.insert(Object{{"b.d", Field(442u)}, {"a.b", Array{"Str11", "Str22"}}, {"a.a", Tuple{"Str33", 444u}}, {"a.c", Field("Str44")}, {"a.d", Array{Field(445), Field(446)}}, {"a.e", Field(447)}});

    /// Serialize all three rows into the arena; `pos` is advanced by each call.
    Arena arena;
    const char * pos = nullptr;
    auto ref1 = col_object.serializeValueIntoArena(0, arena, pos);
    col_object.serializeValueIntoArena(1, arena, pos);
    col_object.serializeValueIntoArena(2, arena, pos);

    /// Deserialize into a separate, empty column of the same type.
    auto col2 = type->createColumn();
    auto & col_object2 = assert_cast<ColumnObject &>(*col2);
    pos = col_object2.deserializeAndInsertFromArena(ref1.data);
    pos = col_object2.deserializeAndInsertFromArena(pos);
    col_object2.deserializeAndInsertFromArena(pos);

    /// Typed paths always materialize (with defaults), dynamic/shared values round-trip.
    ASSERT_EQ(col_object2[0], (Object{{"b.d", Field(42u)}, {"a.b", Array{"Str1", "Str2"}}, {"a.a", Tuple{"Str3", 441u}}, {"a.c", Field("Str4")}, {"a.d", Array{Field(45), Field(46)}}, {"a.e", Field(47)}}));
    ASSERT_EQ(col_object2[1], (Object{{"b.d", Field{0u}}, {"a.b", Array{}}, {"b.a", Field(48)}, {"b.b", Array{Field(49), Field(50)}}}));
    ASSERT_EQ(col_object2[2], (Object{{"b.d", Field(442u)}, {"a.b", Array{"Str11", "Str22"}}, {"a.a", Tuple{"Str33", 444u}}, {"a.c", Field("Str44")}, {"a.d", Array{Field(445), Field(446)}}, {"a.e", Field(447)}}));
}
/// skipSerializedInArena() must advance past each serialized row exactly:
/// after skipping all three rows the cursor lands on the end of the last
/// serialized value.
TEST(ColumnObject, SkipSerializedInArena)
{
    auto type = DataTypeFactory::instance().get("JSON(max_dynamic_types=10, max_dynamic_paths=2, b.d UInt32, a.b Array(String))");
    auto col = type->createColumn();
    auto & col_object = assert_cast<ColumnObject &>(*col);
    col_object.insert(Object{{"b.d", Field(42u)}, {"a.b", Array{"Str1", "Str2"}}, {"a.a", Tuple{"Str3", 441u}}, {"a.c", Field("Str4")}, {"a.d", Array{Field(45), Field(46)}}, {"a.e", Field(47)}});
    col_object.insert(Object{{"b.a", Field(48)}, {"b.b", Array{Field(49), Field(50)}}});
    col_object.insert(Object{{"b.d", Field(442u)}, {"a.b", Array{"Str11", "Str22"}}, {"a.a", Tuple{"Str33", 444u}}, {"a.c", Field("Str44")}, {"a.d", Array{Field(445), Field(446)}}, {"a.e", Field(447)}});

    Arena arena;
    const char * pos = nullptr;
    auto ref1 = col_object.serializeValueIntoArena(0, arena, pos);
    col_object.serializeValueIntoArena(1, arena, pos);
    auto ref3 = col_object.serializeValueIntoArena(2, arena, pos);
    /// End of the last serialized value -- where three skips must end up.
    const char * end = ref3.data + ref3.size;

    auto col2 = type->createColumn();
    pos = col2->skipSerializedInArena(ref1.data);
    pos = col2->skipSerializedInArena(pos);
    pos = col2->skipSerializedInArena(pos);
    ASSERT_EQ(pos, end);
}

View File

@ -608,6 +608,7 @@
M(727, UNEXPECTED_TABLE_ENGINE) \
M(728, UNEXPECTED_DATA_TYPE) \
M(729, ILLEGAL_TIME_SERIES_TAGS) \
M(730, REFRESH_FAILED) \
\
M(900, DISTRIBUTED_CACHE_ERROR) \
M(901, CANNOT_USE_DISTRIBUTED_CACHE) \

View File

@ -110,7 +110,7 @@ namespace
errno = saved_errno;
}
[[maybe_unused]] constexpr UInt32 TIMER_PRECISION = 1e9;
[[maybe_unused]] constexpr UInt64 TIMER_PRECISION = 1e9;
}
namespace ErrorCodes
@ -167,18 +167,18 @@ void Timer::createIfNecessary(UInt64 thread_id, int clock_type, int pause_signal
}
}
void Timer::set(UInt32 period)
void Timer::set(UInt64 period)
{
/// Too high frequency can introduce infinite busy loop of signal handlers. We will limit maximum frequency (with 1000 signals per second).
period = std::max<UInt32>(period, 1000000);
period = std::max<UInt64>(period, 1000000);
/// Randomize offset as uniform random value from 0 to period - 1.
/// It will allow to sample short queries even if timer period is large.
/// (For example, with period of 1 second, query with 50 ms duration will be sampled with 1 / 20 probability).
/// It also helps to avoid interference (moire).
UInt32 period_rand = std::uniform_int_distribution<UInt32>(0, period)(thread_local_rng);
UInt64 period_rand = std::uniform_int_distribution<UInt64>(0, period)(thread_local_rng);
struct timespec interval{.tv_sec = period / TIMER_PRECISION, .tv_nsec = period % TIMER_PRECISION};
struct timespec offset{.tv_sec = period_rand / TIMER_PRECISION, .tv_nsec = period_rand % TIMER_PRECISION};
struct timespec interval{.tv_sec = time_t(period / TIMER_PRECISION), .tv_nsec = int64_t(period % TIMER_PRECISION)};
struct timespec offset{.tv_sec = time_t(period_rand / TIMER_PRECISION), .tv_nsec = int64_t(period_rand % TIMER_PRECISION)};
struct itimerspec timer_spec = {.it_interval = interval, .it_value = offset};
if (timer_settime(*timer_id, 0, &timer_spec, nullptr))
@ -229,7 +229,7 @@ void Timer::cleanup()
template <typename ProfilerImpl>
QueryProfilerBase<ProfilerImpl>::QueryProfilerBase(
[[maybe_unused]] UInt64 thread_id, [[maybe_unused]] int clock_type, [[maybe_unused]] UInt32 period, [[maybe_unused]] int pause_signal_)
[[maybe_unused]] UInt64 thread_id, [[maybe_unused]] int clock_type, [[maybe_unused]] UInt64 period, [[maybe_unused]] int pause_signal_)
: log(getLogger("QueryProfiler")), pause_signal(pause_signal_)
{
#if defined(SANITIZER)
@ -270,7 +270,7 @@ QueryProfilerBase<ProfilerImpl>::QueryProfilerBase(
template <typename ProfilerImpl>
void QueryProfilerBase<ProfilerImpl>::setPeriod([[maybe_unused]] UInt32 period_)
void QueryProfilerBase<ProfilerImpl>::setPeriod([[maybe_unused]] UInt64 period_)
{
#if defined(SANITIZER)
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "QueryProfiler disabled because they cannot work under sanitizers");
@ -307,7 +307,7 @@ void QueryProfilerBase<ProfilerImpl>::cleanup()
template class QueryProfilerBase<QueryProfilerReal>;
template class QueryProfilerBase<QueryProfilerCPU>;
QueryProfilerReal::QueryProfilerReal(UInt64 thread_id, UInt32 period)
QueryProfilerReal::QueryProfilerReal(UInt64 thread_id, UInt64 period)
: QueryProfilerBase(thread_id, CLOCK_MONOTONIC, period, SIGUSR1)
{}
@ -320,7 +320,7 @@ void QueryProfilerReal::signalHandler(int sig, siginfo_t * info, void * context)
writeTraceInfo(TraceType::Real, sig, info, context);
}
QueryProfilerCPU::QueryProfilerCPU(UInt64 thread_id, UInt32 period)
QueryProfilerCPU::QueryProfilerCPU(UInt64 thread_id, UInt64 period)
: QueryProfilerBase(thread_id, CLOCK_THREAD_CPUTIME_ID, period, SIGUSR2)
{}

View File

@ -40,7 +40,7 @@ public:
~Timer();
void createIfNecessary(UInt64 thread_id, int clock_type, int pause_signal);
void set(UInt32 period);
void set(UInt64 period);
void stop();
void cleanup();
@ -54,10 +54,10 @@ template <typename ProfilerImpl>
class QueryProfilerBase
{
public:
QueryProfilerBase(UInt64 thread_id, int clock_type, UInt32 period, int pause_signal_);
QueryProfilerBase(UInt64 thread_id, int clock_type, UInt64 period, int pause_signal_);
~QueryProfilerBase();
void setPeriod(UInt32 period_);
void setPeriod(UInt64 period_);
private:
void cleanup();
@ -76,7 +76,7 @@ private:
class QueryProfilerReal : public QueryProfilerBase<QueryProfilerReal>
{
public:
QueryProfilerReal(UInt64 thread_id, UInt32 period); /// NOLINT
QueryProfilerReal(UInt64 thread_id, UInt64 period); /// NOLINT
static void signalHandler(int sig, siginfo_t * info, void * context);
};
@ -85,7 +85,7 @@ public:
class QueryProfilerCPU : public QueryProfilerBase<QueryProfilerCPU>
{
public:
QueryProfilerCPU(UInt64 thread_id, UInt32 period); /// NOLINT
QueryProfilerCPU(UInt64 thread_id, UInt64 period); /// NOLINT
static void signalHandler(int sig, siginfo_t * info, void * context);
};

View File

@ -248,8 +248,31 @@ void StackTrace::forEachFrame(
auto dwarf_it = dwarfs.try_emplace(object->name, object->elf).first;
DB::Dwarf::LocationInfo location;
if (dwarf_it->second.findAddress(
uintptr_t(current_frame.physical_addr), location, mode, inline_frames))
uintptr_t adjusted_addr = uintptr_t(current_frame.physical_addr);
if (i > 0)
{
/// For non-innermost stack frames, the address points to the *next* instruction
/// after the `call` instruction. But we want the line number and inline function
/// information for the `call` instruction. So subtract 1 from the address.
/// Caveats:
/// * The `call` instruction can be longer than 1 byte, so addr-1 is in the middle
/// of the instruction. That's ok for debug info lookup: address ranges in debug
/// info cover the whole instruction.
/// * If the stack trace unwound out of a signal handler, the stack frame just
/// outside the signal didn't do a function call. It was interrupted by signal.
/// There's no `call` instruction, and decrementing the address is incorrect.
/// We may get incorrect line number and inlined functions in this case.
/// Unfortunate.
/// Note that libunwind, when producing this stack trace, knows whether this
/// frame is interrupted by signal or not. We could propagate this information
/// from libunwind to here and avoid subtracting 1 in this case, but currently
/// we don't do this.
/// But we don't do the decrement for findSymbol below (because `call` is
/// ~never the last instruction of a function), so the function name should be
/// correct for both pre-signal frames and regular frames.
adjusted_addr -= 1;
}
if (dwarf_it->second.findAddress(adjusted_addr, location, mode, inline_frames))
{
current_frame.file = location.file.toString();
current_frame.line = location.line;

View File

@ -0,0 +1,30 @@
#pragma once
#include <base/StringRef.h>
namespace DB
{
/// See https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2018/p0919r3.html
struct StringHashForHeterogeneousLookup
{
using hash_type = std::hash<std::string_view>;
using transparent_key_equal = std::equal_to<>;
using is_transparent = void; // required to make find() work with different type than key_type
auto operator()(const std::string_view view) const
{
return hash_type()(view);
}
auto operator()(const std::string & str) const
{
return hash_type()(str);
}
auto operator()(const char * data) const
{
return hash_type()(data);
}
};
}

View File

@ -1570,7 +1570,7 @@ size_t getFailedOpIndex(Coordination::Error exception_code, const Coordination::
KeeperMultiException::KeeperMultiException(Coordination::Error exception_code, size_t failed_op_index_, const Coordination::Requests & requests_, const Coordination::Responses & responses_)
: KeeperException(exception_code, "Transaction failed: Op #{}, path", failed_op_index_),
: KeeperException(exception_code, "Transaction failed ({}): Op #{}, path", exception_code, failed_op_index_),
requests(requests_), responses(responses_), failed_op_index(failed_op_index_)
{
addMessage(getPathForFirstFailedOp());

View File

@ -44,7 +44,7 @@ namespace ErrorCodes
namespace zkutil
{
/// Preferred size of multi() command (in number of ops)
/// Preferred size of multi command (in the number of operations)
constexpr size_t MULTI_BATCH_SIZE = 100;
struct ShuffleHost

View File

@ -79,11 +79,16 @@ std::vector<String> parseRemoteDescription(
/// Look for the corresponding closing bracket
for (m = i + 1; m < r; ++m)
{
if (description[m] == '{') ++cnt;
if (description[m] == '}') --cnt;
if (description[m] == '.' && description[m-1] == '.') last_dot = m;
if (description[m] == separator) have_splitter = true;
if (cnt == 0) break;
if (description[m] == '{')
++cnt;
if (description[m] == '}')
--cnt;
if (description[m] == '.' && description[m-1] == '.')
last_dot = m;
if (description[m] == separator)
have_splitter = true;
if (cnt == 0)
break;
}
if (cnt != 0)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Table function '{}': incorrect brace sequence in first argument", func_name);

View File

@ -54,7 +54,7 @@ namespace
std::filesystem::path path(snapshot_path);
std::string filename = path.stem();
Strings name_parts;
splitInto<'_'>(name_parts, filename);
splitInto<'_', '.'>(name_parts, filename);
return parse<uint64_t>(name_parts[1]);
}

View File

@ -26,12 +26,16 @@ std::optional<RaftServerConfig> RaftServerConfig::parse(std::string_view server)
if (!with_id_endpoint && !with_server_type && !with_priority)
return std::nullopt;
const std::string_view id_str = parts[0];
std::string_view id_str = parts[0];
if (!id_str.starts_with("server."))
return std::nullopt;
id_str = id_str.substr(7);
if (auto eq_pos = id_str.find('='); std::string_view::npos != eq_pos)
id_str = id_str.substr(0, eq_pos);
Int32 id;
if (!tryParse(id, std::next(id_str.begin(), 7)))
if (!tryParse(id, id_str))
return std::nullopt;
if (id <= 0)
return std::nullopt;

View File

@ -24,9 +24,7 @@ void GTIDSet::tryMerge(size_t i)
void GTIDSets::parse(String gtid_format)
{
if (gtid_format.empty())
{
return;
}
std::vector<String> gtid_sets;
boost::split(gtid_sets, gtid_format, [](char c) { return c == ','; });

View File

@ -10,20 +10,19 @@ GTEST_TEST(GTIDSetsContains, Tests)
contained1, contained2, contained3, contained4, contained5,
not_contained1, not_contained2, not_contained3, not_contained4, not_contained5, not_contained6;
gtid_set.parse("2174B383-5441-11E8-B90A-C80AA9429562:1-3:11:47-49, 24DA167-0C0C-11E8-8442-00059A3C7B00:1-19:47-49:60");
contained1.parse("2174B383-5441-11E8-B90A-C80AA9429562:1-3:11:47-49, 24DA167-0C0C-11E8-8442-00059A3C7B00:1-19:47-49:60");
gtid_set.parse("2174B383-5441-11E8-B90A-C80AA9429562:1-3:11:47-49, FBC30C64-F8C9-4DDF-8CDD-066208EB433B:1-19:47-49:60");
contained1.parse("2174B383-5441-11E8-B90A-C80AA9429562:1-3:11:47-49, FBC30C64-F8C9-4DDF-8CDD-066208EB433B:1-19:47-49:60");
contained2.parse("2174B383-5441-11E8-B90A-C80AA9429562:2-3:11:47-49");
contained3.parse("2174B383-5441-11E8-B90A-C80AA9429562:11");
contained4.parse("24DA167-0C0C-11E8-8442-00059A3C7B00:2-16:47-49:60");
contained5.parse("24DA167-0C0C-11E8-8442-00059A3C7B00:60");
contained4.parse("FBC30C64-F8C9-4DDF-8CDD-066208EB433B:2-16:47-49:60");
contained5.parse("FBC30C64-F8C9-4DDF-8CDD-066208EB433B:60");
not_contained1.parse("2174B383-5441-11E8-B90A-C80AA9429562:1-3:11:47-50, 24DA167-0C0C-11E8-8442-00059A3C7B00:1-19:47-49:60");
not_contained1.parse("2174B383-5441-11E8-B90A-C80AA9429562:1-3:11:47-50, FBC30C64-F8C9-4DDF-8CDD-066208EB433B:1-19:47-49:60");
not_contained2.parse("2174B383-5441-11E8-B90A-C80AA9429562:0-3:11:47-49");
not_contained3.parse("2174B383-5441-11E8-B90A-C80AA9429562:99");
not_contained4.parse("24DA167-0C0C-11E8-8442-00059A3C7B00:2-16:46-49:60");
not_contained5.parse("24DA167-0C0C-11E8-8442-00059A3C7B00:99");
not_contained6.parse("2174B383-5441-11E8-B90A-C80AA9429562:1-3:11:47-49, 24DA167-0C0C-11E8-8442-00059A3C7B00:1-19:47-49:60, 00000000-0000-0000-0000-000000000000");
not_contained4.parse("FBC30C64-F8C9-4DDF-8CDD-066208EB433B:2-16:46-49:60");
not_contained5.parse("FBC30C64-F8C9-4DDF-8CDD-066208EB433B:99");
not_contained6.parse("2174B383-5441-11E8-B90A-C80AA9429562:1-3:11:47-49, FBC30C64-F8C9-4DDF-8CDD-066208EB433B:1-19:47-49:60, 00000000-0000-0000-0000-000000000000");
ASSERT_TRUE(gtid_set.contains(contained1));
ASSERT_TRUE(gtid_set.contains(contained2));

View File

@ -616,6 +616,7 @@ class IColumn;
M(Bool, throw_if_deduplication_in_dependent_materialized_views_enabled_with_async_insert, true, "Throw exception on INSERT query when the setting `deduplicate_blocks_in_dependent_materialized_views` is enabled along with `async_insert`. It guarantees correctness, because these features can't work together.", 0) \
M(Bool, materialized_views_ignore_errors, false, "Allows to ignore errors for MATERIALIZED VIEW, and deliver original block to the table regardless of MVs", 0) \
M(Bool, ignore_materialized_views_with_dropped_target_table, false, "Ignore MVs with dropped target table during pushing to views", 0) \
M(Bool, allow_materialized_view_with_bad_select, true, "Allow CREATE MATERIALIZED VIEW with SELECT query that references nonexistent tables or columns. It must still be syntactically valid. Doesn't apply to refreshable MVs. Doesn't apply if the MV schema needs to be inferred from the SELECT query (i.e. if the CREATE has no column list and no TO table). Can be used for creating MV before its source table.", 0) \
M(Bool, use_compact_format_in_distributed_parts_names, true, "Changes format of directories names for distributed table insert parts.", 0) \
M(Bool, validate_polygons, true, "Throw exception if polygon is invalid in function pointInPolygon (e.g. self-tangent, self-intersecting). If the setting is false, the function will accept invalid polygons but may silently return wrong result.", 0) \
M(UInt64, max_parser_depth, DBMS_DEFAULT_MAX_PARSER_DEPTH, "Maximum parser depth (recursion depth of recursive descend parser).", 0) \
@ -878,6 +879,7 @@ class IColumn;
M(Bool, allow_get_client_http_header, false, "Allow to use the function `getClientHTTPHeader` which lets to obtain a value of an the current HTTP request's header. It is not enabled by default for security reasons, because some headers, such as `Cookie`, could contain sensitive info. Note that the `X-ClickHouse-*` and `Authentication` headers are always restricted and cannot be obtained with this function.", 0) \
M(Bool, cast_string_to_dynamic_use_inference, false, "Use types inference during String to Dynamic conversion", 0) \
M(Bool, enable_blob_storage_log, true, "Write information about blob storage operations to system.blob_storage_log table", 0) \
M(Bool, use_json_alias_for_old_object_type, false, "When enabled, JSON type alias will create old experimental Object type instead of a new JSON type", 0) \
M(Bool, allow_create_index_without_type, false, "Allow CREATE INDEX query without TYPE. Query will be ignored. Made for SQL compatibility tests.", 0) \
M(Bool, create_index_ignore_unique, false, "Ignore UNIQUE keyword in CREATE UNIQUE INDEX. Made for SQL compatibility tests.", 0) \
M(Bool, print_pretty_type_names, true, "Print pretty type names in DESCRIBE query and toTypeName() function", 0) \
@ -912,6 +914,7 @@ class IColumn;
M(Bool, allow_experimental_vector_similarity_index, false, "Allow experimental vector similarity index", 0) \
M(Bool, allow_experimental_variant_type, false, "Allow Variant data type", 0) \
M(Bool, allow_experimental_dynamic_type, false, "Allow Dynamic data type", 0) \
M(Bool, allow_experimental_json_type, false, "Allow JSON data type", 0) \
M(Bool, allow_experimental_codecs, false, "If it is set to true, allow to specify experimental compression codecs (but we don't have those yet and this option does nothing).", 0) \
M(UInt64, max_limit_for_ann_queries, 1'000'000, "SELECT queries with LIMIT bigger than this setting cannot use ANN indexes. Helps to prevent memory overflows in ANN search indexes.", 0) \
M(Bool, throw_on_unsupported_query_inside_transaction, true, "Throw exception if unsupported query is used inside transaction", 0) \
@ -1133,6 +1136,7 @@ class IColumn;
M(Bool, input_format_json_defaults_for_missing_elements_in_named_tuple, true, "Insert default value in named tuple element if it's missing in json object", 0) \
M(Bool, input_format_json_throw_on_bad_escape_sequence, true, "Throw an exception if JSON string contains bad escape sequence in JSON input formats. If disabled, bad escape sequences will remain as is in the data", 0) \
M(Bool, input_format_json_ignore_unnecessary_fields, true, "Ignore unnecessary fields and not parse them. Enabling this may not throw exceptions on json strings of invalid format or with duplicated fields", 0) \
M(Bool, type_json_skip_duplicated_paths, false, "When enabled, during parsing JSON object into JSON type duplicated paths will be ignored and only the first one will be inserted instead of an exception", 0) \
M(UInt64, input_format_json_max_depth, 1000, "Maximum depth of a field in JSON. This is not a strict limit, it does not have to be applied precisely.", 0) \
M(Bool, input_format_try_infer_integers, true, "Try to infer integers instead of floats while schema inference in text formats", 0) \
M(Bool, input_format_try_infer_dates, true, "Try to infer dates from string fields while schema inference in text formats", 0) \

View File

@ -84,10 +84,14 @@ static std::initializer_list<std::pair<ClickHouseVersion, SettingsChangesHistory
{"use_hive_partitioning", false, false, "Allows to use hive partitioning for File, URL, S3, AzureBlobStorage and HDFS engines."},
{"allow_experimental_kafka_offsets_storage_in_keeper", false, false, "Allow the usage of experimental Kafka storage engine that stores the committed offsets in ClickHouse Keeper"},
{"allow_archive_path_syntax", true, true, "Added new setting to allow disabling archive path syntax."},
{"allow_materialized_view_with_bad_select", true, true, "Support (but not enable yet) stricter validation in CREATE MATERIALIZED VIEW"},
{"query_cache_tag", "", "", "New setting for labeling query cache settings."},
{"allow_experimental_time_series_table", false, false, "Added new setting to allow the TimeSeries table engine"},
{"enable_analyzer", 1, 1, "Added an alias to a setting `allow_experimental_analyzer`."},
{"optimize_functions_to_subcolumns", false, true, "Enabled settings by default"},
{"allow_experimental_json_type", false, false, "Add new experimental JSON type"},
{"use_json_alias_for_old_object_type", true, false, "Use JSON type alias to create new JSON type"},
{"type_json_skip_duplicated_paths", false, false, "Allow to skip duplicated paths during JSON parsing"},
{"join_output_by_rowlist_perkey_rows_threshold", 0, 5, "The lower limit of per-key average rows in the right table to determine whether to output by row list in hash join."},
{"allow_experimental_vector_similarity_index", false, false, "Added new setting to allow experimental vector similarity indexes"},
{"input_format_try_infer_datetimes_only_datetime64", true, false, "Allow to infer DateTime instead of DateTime64 in data formats"}

View File

@ -237,7 +237,7 @@ SettingFieldMaxThreads & SettingFieldMaxThreads::operator=(const Field & f)
String SettingFieldMaxThreads::toString() const
{
if (is_auto)
return "'auto(" + ::DB::toString(value) + ")'";
return "auto(" + ::DB::toString(value) + ")";
else
return ::DB::toString(value);
}

View File

@ -153,7 +153,7 @@ struct SettingFieldMaxThreads
operator UInt64() const { return value; } /// NOLINT
explicit operator Field() const { return value; }
/// Writes "auto(<number>)" instead of simple "<number>" if `is_auto==true`.
/// Writes "auto(<number>)" instead of simple "<number>" if `is_auto == true`.
String toString() const;
void parseFromString(const String & str);

View File

@ -45,6 +45,7 @@ enum class TypeIndex : uint8_t
AggregateFunction,
LowCardinality,
Map,
ObjectDeprecated,
Object,
IPv4,
IPv6,

View File

@ -15,6 +15,7 @@
#include <Parsers/ASTFunction.h>
#include <Parsers/ASTIdentifier.h>
#include <Parsers/ASTLiteral.h>
#include <base/find_symbols.h>
#include <IO/ReadBufferFromMemory.h>
namespace DB
@ -67,7 +68,11 @@ static DataTypePtr create(const ASTPtr & arguments)
if (!argument || argument->name != "equals")
throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, "Dynamic data type argument should be in a form 'max_types=N'");
auto identifier_name = argument->arguments->children[0]->as<ASTIdentifier>()->name();
const auto * identifier = argument->arguments->children[0]->as<ASTIdentifier>();
if (!identifier)
throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, "Unexpected Dynamic type argument: {}. Expected expression 'max_types=N'", identifier->formatForErrorMessage());
auto identifier_name = identifier->name();
if (identifier_name != "max_types")
throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, "Unexpected identifier: {}. Dynamic data type argument should be in a form 'max_types=N'", identifier_name);
@ -84,9 +89,53 @@ void registerDataTypeDynamic(DataTypeFactory & factory)
factory.registerDataType("Dynamic", create);
}
namespace
{
/// Split Dynamic subcolumn name into 2 parts: type name and subcolumn of this type.
/// We cannot simply split by '.' because type name can also contain dots. For example: Tuple(`a.b` UInt32).
/// But in all such cases this '.' will be inside back quotes. To split subcolumn name correctly
/// we search for the first '.' that is not inside back quotes.
std::pair<std::string_view, std::string_view> splitSubcolumnName(std::string_view subcolumn_name)
{
bool inside_quotes = false;
const char * pos = subcolumn_name.data();
const char * end = subcolumn_name.data() + subcolumn_name.size();
while (true)
{
pos = find_first_symbols<'`', '.', '\\'>(pos, end);
if (pos == end)
break;
if (*pos == '`')
{
inside_quotes = !inside_quotes;
++pos;
}
else if (*pos == '\\')
{
++pos;
}
else if (*pos == '.')
{
if (inside_quotes)
++pos;
else
break;
}
}
if (pos == end)
return {subcolumn_name, {}};
return {std::string_view(subcolumn_name.data(), pos), std::string_view(pos + 1, end)};
}
}
std::unique_ptr<IDataType::SubstreamData> DataTypeDynamic::getDynamicSubcolumnData(std::string_view subcolumn_name, const DB::IDataType::SubstreamData & data, bool throw_if_null) const
{
auto [type_subcolumn_name, subcolumn_nested_name] = Nested::splitName(subcolumn_name);
auto [type_subcolumn_name, subcolumn_nested_name] = splitSubcolumnName(subcolumn_name);
/// Check if requested subcolumn is a valid data type.
auto subcolumn_type = DataTypeFactory::instance().tryGet(String(type_subcolumn_name));
if (!subcolumn_type)

View File

@ -12,6 +12,9 @@ class DataTypeDynamic final : public IDataType
public:
static constexpr bool is_parametric = true;
/// Don't change this constant, it can break backward compatibility.
static constexpr size_t DEFAULT_MAX_DYNAMIC_TYPES = 32;
explicit DataTypeDynamic(size_t max_dynamic_types_ = DEFAULT_MAX_DYNAMIC_TYPES);
TypeIndex getTypeId() const override { return TypeIndex::Dynamic; }
@ -43,8 +46,6 @@ public:
size_t getMaxDynamicTypes() const { return max_dynamic_types; }
private:
static constexpr size_t DEFAULT_MAX_DYNAMIC_TYPES = 32;
SerializationPtr doGetDefaultSerialization() const override;
String doGetName() const override;

View File

@ -273,9 +273,10 @@ DataTypeFactory::DataTypeFactory()
registerDataTypeDomainSimpleAggregateFunction(*this);
registerDataTypeDomainGeo(*this);
registerDataTypeMap(*this);
registerDataTypeObject(*this);
registerDataTypeObjectDeprecated(*this);
registerDataTypeVariant(*this);
registerDataTypeDynamic(*this);
registerDataTypeJSON(*this);
}
DataTypeFactory & DataTypeFactory::instance()

View File

@ -99,8 +99,9 @@ void registerDataTypeLowCardinality(DataTypeFactory & factory);
void registerDataTypeDomainBool(DataTypeFactory & factory);
void registerDataTypeDomainSimpleAggregateFunction(DataTypeFactory & factory);
void registerDataTypeDomainGeo(DataTypeFactory & factory);
void registerDataTypeObject(DataTypeFactory & factory);
void registerDataTypeObjectDeprecated(DataTypeFactory & factory);
void registerDataTypeVariant(DataTypeFactory & factory);
void registerDataTypeDynamic(DataTypeFactory & factory);
void registerDataTypeJSON(DataTypeFactory & factory);
}

View File

@ -1,83 +1,511 @@
#include <DataTypes/DataTypeObject.h>
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/Serializations/SerializationObject.h>
#include <DataTypes/DataTypeObject.h>
#include <DataTypes/Serializations/SerializationJSON.h>
#include <DataTypes/Serializations/SerializationObjectTypedPath.h>
#include <DataTypes/Serializations/SerializationObjectDynamicPath.h>
#include <DataTypes/Serializations/SerializationSubObject.h>
#include <Columns/ColumnObject.h>
#include <Parsers/IAST.h>
#include <Parsers/ASTLiteral.h>
#include <Parsers/ASTDataType.h>
#include <Parsers/ASTFunction.h>
#include <Parsers/ASTIdentifier.h>
#include <Parsers/ASTObjectTypeArgument.h>
#include <Parsers/ASTNameTypePair.h>
#include <Formats/JSONExtractTree.h>
#include <Interpreters/Context.h>
#include <Core/Settings.h>
#include <IO/Operators.h>
#if USE_SIMDJSON
#include <Common/JSONParsers/SimdJSONParser.h>
#endif
#if USE_RAPIDJSON
#include <Common/JSONParsers/RapidJSONParser.h>
#endif
#include <Common/JSONParsers/DummyJSONParser.h>
namespace DB
{
namespace ErrorCodes
{
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int UNEXPECTED_AST_STRUCTURE;
extern const int BAD_ARGUMENTS;
}
DataTypeObject::DataTypeObject(const String & schema_format_, bool is_nullable_)
: schema_format(Poco::toLower(schema_format_))
, is_nullable(is_nullable_)
DataTypeObject::DataTypeObject(
const SchemaFormat & schema_format_,
std::unordered_map<String, DataTypePtr> typed_paths_,
std::unordered_set<String> paths_to_skip_,
std::vector<String> path_regexps_to_skip_,
size_t max_dynamic_paths_,
size_t max_dynamic_types_)
: schema_format(schema_format_)
, typed_paths(std::move(typed_paths_))
, paths_to_skip(std::move(paths_to_skip_))
, path_regexps_to_skip(std::move(path_regexps_to_skip_))
, max_dynamic_paths(max_dynamic_paths_)
, max_dynamic_types(max_dynamic_types_)
{
for (const auto & [typed_path, type] : typed_paths)
{
for (const auto & path_to_skip : paths_to_skip)
{
if (typed_path.starts_with(path_to_skip))
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Path '{}' is specified with the data type ('{}') and matches the SKIP path prefix '{}'", typed_path, type->getName(), path_to_skip);
}
for (const auto & path_regex_to_skip : path_regexps_to_skip)
{
if (re2::RE2::FullMatch(typed_path, re2::RE2(path_regex_to_skip)))
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Path '{}' is specified with the data type ('{}') and matches the SKIP REGEXP '{}'", typed_path, type->getName(), path_regex_to_skip);
}
}
}
DataTypeObject::DataTypeObject(const DB::DataTypeObject::SchemaFormat & schema_format_, size_t max_dynamic_paths_, size_t max_dynamic_types_)
: schema_format(schema_format_)
, max_dynamic_paths(max_dynamic_paths_)
, max_dynamic_types(max_dynamic_types_)
{
}
bool DataTypeObject::equals(const IDataType & rhs) const
{
if (const auto * object = typeid_cast<const DataTypeObject *>(&rhs))
return schema_format == object->schema_format && is_nullable == object->is_nullable;
{
if (typed_paths.size() != object->typed_paths.size())
return false;
for (const auto & [path, type] : typed_paths)
{
auto it = object->typed_paths.find(path);
if (it == object->typed_paths.end())
return false;
if (!type->equals(*it->second))
return false;
}
return schema_format == object->schema_format && paths_to_skip == object->paths_to_skip && path_regexps_to_skip == object->path_regexps_to_skip
&& max_dynamic_types == object->max_dynamic_types && max_dynamic_paths == object->max_dynamic_paths;
}
return false;
}
SerializationPtr DataTypeObject::doGetDefaultSerialization() const
{
return getObjectSerialization(schema_format);
std::unordered_map<String, SerializationPtr> typed_path_serializations;
typed_path_serializations.reserve(typed_paths.size());
for (const auto & [path, type] : typed_paths)
typed_path_serializations[path] = type->getDefaultSerialization();
switch (schema_format)
{
case SchemaFormat::JSON:
#ifdef USE_SIMDJSON
return std::make_shared<SerializationJSON<SimdJSONParser>>(
std::move(typed_path_serializations),
paths_to_skip,
path_regexps_to_skip,
buildJSONExtractTree<SimdJSONParser>(getPtr(), "JSON serialization"));
#elif USE_RAPIDJSON
return std::make_shared<SerializationJSON<RapidJSONParser>>(
std::move(typed_path_serializations),
paths_to_skip,
path_regexps_to_skip,
buildJSONExtractTree<RapidJSONParser>(getPtr(), "JSON serialization"));
#else
return std::make_shared<SerializationJSON<DummyJSONParser>>(
std::move(typed_path_serializations),
paths_to_skip,
path_regexps_to_skip,
buildJSONExtractTree<DummyJSONParser>(getPtr(), "JSON serialization"));
#endif
}
}
String DataTypeObject::doGetName() const
{
WriteBufferFromOwnString out;
if (is_nullable)
out << "Object(Nullable(" << quote << schema_format << "))";
out << magic_enum::enum_name(schema_format);
bool first = true;
auto write_separator = [&]()
{
if (!first)
{
out << ", ";
}
else
out << "Object(" << quote << schema_format << ")";
{
out << "(";
first = false;
}
};
if (max_dynamic_types != DataTypeDynamic::DEFAULT_MAX_DYNAMIC_TYPES)
{
write_separator();
out << "max_dynamic_types=" << max_dynamic_types;
}
if (max_dynamic_paths != DEFAULT_MAX_SEPARATELY_STORED_PATHS)
{
write_separator();
out << "max_dynamic_paths=" << max_dynamic_paths;
}
std::vector<String> sorted_typed_paths;
sorted_typed_paths.reserve(typed_paths.size());
for (const auto & [path, _] : typed_paths)
sorted_typed_paths.push_back(path);
std::sort(sorted_typed_paths.begin(), sorted_typed_paths.end());
for (const auto & path : sorted_typed_paths)
{
write_separator();
out << backQuoteIfNeed(path) << " " << typed_paths.at(path)->getName();
}
std::vector<String> sorted_skip_paths;
sorted_skip_paths.reserve(paths_to_skip.size());
for (const auto & skip_path : paths_to_skip)
sorted_skip_paths.push_back(skip_path);
std::sort(sorted_skip_paths.begin(), sorted_skip_paths.end());
for (const auto & skip_path : sorted_skip_paths)
{
write_separator();
out << "SKIP " << backQuoteIfNeed(skip_path);
}
for (const auto & skip_regexp : path_regexps_to_skip)
{
write_separator();
out << "SKIP REGEXP " << quoteString(skip_regexp);
}
if (!first)
out << ")";
return out.str();
}
static DataTypePtr create(const ASTPtr & arguments)
MutableColumnPtr DataTypeObject::createColumn() const
{
if (!arguments || arguments->children.size() != 1)
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
"Object data type family must have one argument - name of schema format");
std::unordered_map<String, MutableColumnPtr> typed_path_columns;
typed_path_columns.reserve(typed_paths.size());
for (const auto & [path, type] : typed_paths)
typed_path_columns[path] = type->createColumn();
ASTPtr schema_argument = arguments->children[0];
bool is_nullable = false;
return ColumnObject::create(std::move(typed_path_columns), max_dynamic_paths, max_dynamic_types);
}
if (const auto * type = schema_argument->as<ASTDataType>())
namespace
{
/// It is possible to have nested JSON object inside Dynamic. For example when we have an array of JSON objects.
/// During type inference in parsing in case of creating nested JSON objects, we reduce max_dynamic_paths/max_dynamic_types by factors
/// NESTED_OBJECT_MAX_DYNAMIC_PATHS_REDUCE_FACTOR/NESTED_OBJECT_MAX_DYNAMIC_TYPES_REDUCE_FACTOR.
/// So the type name will actually be JSON(max_dynamic_paths=N, max_dynamic_types=M). But we want the user to be able to query it
/// using json.array.:`Array(JSON)`.some.path without specifying max_dynamic_paths/max_dynamic_types.
/// To support it, we do a trick - we replace JSON name in subcolumn to JSON(max_dynamic_paths=N, max_dynamic_types=M), because we know
/// the exact values of max_dynamic_paths/max_dynamic_types for it.
void replaceJSONTypeNameIfNeeded(String & type_name, size_t max_dynamic_paths, size_t max_dynamic_types)
{
auto pos = type_name.find("JSON");
while (pos != String::npos)
{
if (type->name != "Nullable" || type->arguments->children.size() != 1)
throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE,
"Expected 'Nullable(<schema_name>)' as parameter for type Object (function: {})", type->name);
/// Replace only if we don't already have parameters in JSON type declaration.
if (pos + 4 == type_name.size() || type_name[pos + 4] != '(')
type_name.replace(
pos,
4,
fmt::format(
"JSON(max_dynamic_paths={}, max_dynamic_types={})",
max_dynamic_paths / DataTypeObject::NESTED_OBJECT_MAX_DYNAMIC_PATHS_REDUCE_FACTOR,
max_dynamic_types / DataTypeObject::NESTED_OBJECT_MAX_DYNAMIC_TYPES_REDUCE_FACTOR));
pos = type_name.find("JSON", pos + 4);
}
}
schema_argument = type->arguments->children[0];
is_nullable = true;
/// JSON subcolumn name with Dynamic type subcolumn looks like this:
/// "json.some.path.:`Type_name`.some.subcolumn".
/// We back quoted type name during identifier parsing so we can distinguish type subcolumn and path element ":TypeName".
std::pair<String, String> splitPathAndDynamicTypeSubcolumn(std::string_view subcolumn_name, size_t max_dynamic_paths, size_t max_dynamic_types)
{
    /// Look for the dynamic type subcolumn marker of the form .:`Type`.
    const auto marker_pos = subcolumn_name.find(".:`");
    if (marker_pos == std::string_view::npos)
        return {String(subcolumn_name), ""};

    /// Attempt to parse the back quoted type name that follows ".:".
    ReadBufferFromMemory buf(subcolumn_name.substr(marker_pos + 2));
    String dynamic_subcolumn;
    if (!tryReadBackQuotedString(dynamic_subcolumn, buf))
        return {String(subcolumn_name), ""};

    replaceJSONTypeNameIfNeeded(dynamic_subcolumn, max_dynamic_paths, max_dynamic_types);

    /// Anything left in the buffer is a subcolumn of the parsed type, keep it attached to the type name.
    if (!buf.eof())
        dynamic_subcolumn.append(buf.position(), buf.available());

    return {String(subcolumn_name.substr(0, marker_pos)), dynamic_subcolumn};
}
/// Sub-object subcolumn in JSON path always looks like "^`some`.path.path".
/// We back quote first path element after `^` so we can distinguish sub-object subcolumn and path element "^path".
std::optional<String> tryGetSubObjectSubcolumn(std::string_view subcolumn_name)
{
if (!subcolumn_name.starts_with("^`"))
return std::nullopt;
ReadBufferFromMemory buf(subcolumn_name.data() + 1);
String path;
/// Try to read back-quoted first path element.
if (!tryReadBackQuotedString(path, buf))
return std::nullopt;
/// Add remaining path elements if any.
return path + String(buf.position(), buf.available());
}
/// Return sub-path by specified prefix.
/// For example, for prefix a.b:
/// a.b.c.d -> c.d, a.b.c -> c
String getSubPath(const String & path, const String & prefix)
{
    /// Skip the prefix itself plus the dot that separates it from the rest of the path.
    const size_t to_skip = prefix.size() + 1;
    return path.substr(to_skip);
}
std::string_view getSubPath(std::string_view path, const String & prefix)
{
    /// Same as the String overload, but returns a view into the original path (no copy).
    const size_t to_skip = prefix.size() + 1;
    return path.substr(to_skip);
}
}
std::unique_ptr<ISerialization::SubstreamData> DataTypeObject::getDynamicSubcolumnData(std::string_view subcolumn_name, const SubstreamData & data, bool throw_if_null) const
{
/// Check if it's sub-object subcolumn.
/// In this case we should return JSON column with all paths that are inside specified object prefix.
/// For example, if we have {"a" : {"b" : {"c" : {"d" : 10, "e" : "Hello"}, "f" : [1, 2, 3]}}} and subcolumn ^a.b
/// we should return JSON column with data {"c" : {"d" : 10, "e" : Hello}, "f" : [1, 2, 3]}
if (auto sub_object_subcolumn = tryGetSubObjectSubcolumn(subcolumn_name))
{
const String & prefix = *sub_object_subcolumn;
/// Collect new typed paths.
std::unordered_map<String, DataTypePtr> typed_sub_paths;
/// Collect serializations for typed paths. They will be needed for sub-object subcolumn deserialization.
std::unordered_map<String, SerializationPtr> typed_paths_serializations;
for (const auto & [path, type] : typed_paths)
{
if (path.starts_with(prefix) && path.size() != prefix.size())
{
typed_sub_paths[getSubPath(path, prefix)] = type;
typed_paths_serializations[path] = type->getDefaultSerialization();
}
}
const auto * literal = schema_argument->as<ASTLiteral>();
if (!literal || literal->value.getType() != Field::Types::String)
throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE,
"Object data type family must have a const string as its schema name parameter");
std::unique_ptr<SubstreamData> res = std::make_unique<SubstreamData>(std::make_shared<SerializationSubObject>(prefix, typed_paths_serializations));
/// Keep all current constraints like limits and skip paths/prefixes/regexps.
res->type = std::make_shared<DataTypeObject>(schema_format, typed_sub_paths, paths_to_skip, path_regexps_to_skip, max_dynamic_paths, max_dynamic_types);
/// If column was provided, we should create a column for the requested subcolumn.
if (data.column)
{
const auto & object_column = assert_cast<const ColumnObject &>(*data.column);
return std::make_shared<DataTypeObject>(literal->value.safeGet<const String &>(), is_nullable);
auto result_column = res->type->createColumn();
auto & result_object_column = assert_cast<ColumnObject &>(*result_column);
/// Iterate over all typed/dynamic/shared data paths and collect all paths with specified prefix.
auto & result_typed_columns = result_object_column.getTypedPaths();
for (const auto & [path, column] : object_column.getTypedPaths())
{
if (path.starts_with(prefix) && path.size() != prefix.size())
result_typed_columns[getSubPath(path, prefix)] = column;
}
auto & result_dynamic_columns = result_object_column.getDynamicPaths();
auto & result_dynamic_columns_ptrs = result_object_column.getDynamicPathsPtrs();
for (const auto & [path, column] : object_column.getDynamicPaths())
{
if (path.starts_with(prefix) && path.size() != prefix.size())
{
auto sub_path = getSubPath(path, prefix);
result_dynamic_columns[sub_path] = column;
result_dynamic_columns_ptrs[sub_path] = assert_cast<ColumnDynamic *>(result_dynamic_columns[sub_path].get());
}
}
const auto & shared_data_offsets = object_column.getSharedDataOffsets();
const auto [shared_data_paths, shared_data_values] = object_column.getSharedDataPathsAndValues();
auto & result_shared_data_offsets = result_object_column.getSharedDataOffsets();
result_shared_data_offsets.reserve(shared_data_offsets.size());
auto [result_shared_data_paths, result_shared_data_values] = result_object_column.getSharedDataPathsAndValues();
for (size_t i = 0; i != shared_data_offsets.size(); ++i)
{
size_t start = shared_data_offsets[ssize_t(i) - 1];
size_t end = shared_data_offsets[ssize_t(i)];
size_t lower_bound_index = ColumnObject::findPathLowerBoundInSharedData(prefix, *shared_data_paths, start, end);
for (; lower_bound_index != end; ++lower_bound_index)
{
auto path = shared_data_paths->getDataAt(lower_bound_index).toView();
if (!path.starts_with(prefix))
break;
/// Don't include path that is equal to the prefix.
if (path.size() != prefix.size())
{
auto sub_path = getSubPath(path, prefix);
result_shared_data_paths->insertData(sub_path.data(), sub_path.size());
result_shared_data_values->insertFrom(*shared_data_values, lower_bound_index);
}
}
result_shared_data_offsets.push_back(result_shared_data_paths->size());
}
res->column = std::move(result_column);
}
return res;
}
/// Split requested subcolumn to the JSON path and Dynamic type subcolumn.
auto [path, path_subcolumn] = splitPathAndDynamicTypeSubcolumn(subcolumn_name, max_dynamic_paths, max_dynamic_types);
std::unique_ptr<SubstreamData> res;
if (auto it = typed_paths.find(path); it != typed_paths.end())
{
res = std::make_unique<SubstreamData>(it->second->getDefaultSerialization());
res->type = it->second;
}
else
{
res = std::make_unique<SubstreamData>(std::make_shared<SerializationDynamic>());
res->type = std::make_shared<DataTypeDynamic>();
}
/// If column was provided, we should create a column for requested subcolumn.
if (data.column)
{
const auto & object_column = assert_cast<const ColumnObject &>(*data.column);
/// Try to find requested path in typed paths.
if (auto typed_it = object_column.getTypedPaths().find(path); typed_it != object_column.getTypedPaths().end())
{
res->column = typed_it->second;
}
/// Try to find requested path in dynamic paths.
else if (auto dynamic_it = object_column.getDynamicPaths().find(path); dynamic_it != object_column.getDynamicPaths().end())
{
res->column = dynamic_it->second;
}
/// Extract values of requested path from shared data.
else
{
auto dynamic_column = ColumnDynamic::create(max_dynamic_types);
dynamic_column->reserve(object_column.size());
ColumnObject::fillPathColumnFromSharedData(*dynamic_column, path, object_column.getSharedDataPtr(), 0, object_column.size());
res->column = std::move(dynamic_column);
}
}
/// Get subcolumn for Dynamic type if needed.
if (!path_subcolumn.empty())
{
res = res->type->getSubcolumnData(path_subcolumn, *res, throw_if_null);
if (!res)
return nullptr;
}
if (typed_paths.contains(path))
res->serialization = std::make_shared<SerializationObjectTypedPath>(res->serialization, path);
else
res->serialization = std::make_shared<SerializationObjectDynamicPath>(res->serialization, path, path_subcolumn, max_dynamic_types);
return res;
}
void registerDataTypeObject(DataTypeFactory & factory)
static DataTypePtr createObject(const ASTPtr & arguments, const DataTypeObject::SchemaFormat & schema_format)
{
factory.registerDataType("Object", create);
factory.registerSimpleDataType("JSON",
[] { return std::make_shared<DataTypeObject>("JSON", false); },
DataTypeFactory::Case::Insensitive);
if (!arguments || arguments->children.empty())
return std::make_shared<DataTypeObject>(schema_format);
std::unordered_map<String, DataTypePtr> typed_paths;
std::unordered_set<String> paths_to_skip;
std::vector<String> path_regexps_to_skip;
size_t max_dynamic_types = DataTypeDynamic::DEFAULT_MAX_DYNAMIC_TYPES;
size_t max_dynamic_paths = DataTypeObject::DEFAULT_MAX_SEPARATELY_STORED_PATHS;
for (const auto & argument : arguments->children)
{
const auto * object_type_argument = argument->as<ASTObjectTypeArgument>();
if (object_type_argument->parameter)
{
const auto * function = object_type_argument->parameter->as<ASTFunction>();
if (!function || function->name != "equals")
throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, "Unexpected parameter in {} type arguments: {}", magic_enum::enum_name(schema_format), function->formatForErrorMessage());
const auto * identifier = function->arguments->children[0]->as<ASTIdentifier>();
if (!identifier)
throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, "Unexpected {} type argument: {}. Expected expression 'max_dynamic_types=N' or 'max_dynamic_paths=N'", magic_enum::enum_name(schema_format), function->formatForErrorMessage());
auto identifier_name = identifier->name();
if (identifier_name != "max_dynamic_types" && identifier_name != "max_dynamic_paths")
throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, "Unexpected parameter in {} type arguments: {}. Expected 'max_dynamic_types' or `max_dynamic_paths`", magic_enum::enum_name(schema_format), identifier_name);
auto * literal = function->arguments->children[1]->as<ASTLiteral>();
/// Is 1000000 a good maximum for max paths?
size_t max_value = identifier_name == "max_dynamic_types" ? ColumnDynamic::MAX_DYNAMIC_TYPES_LIMIT : 1000000;
if (!literal || literal->value.getType() != Field::Types::UInt64 || literal->value.safeGet<UInt64>() > max_value)
throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, "'{}' parameter for {} type should be a positive integer between 0 and {}. Got {}", identifier_name, magic_enum::enum_name(schema_format), max_value, function->arguments->children[1]->formatForErrorMessage());
if (identifier_name == "max_dynamic_types")
max_dynamic_types = literal->value.safeGet<UInt64>();
else
max_dynamic_paths = literal->value.safeGet<UInt64>();
}
else if (object_type_argument->path_with_type)
{
const auto * path_with_type = object_type_argument->path_with_type->as<ASTNameTypePair>();
auto data_type = DataTypeFactory::instance().get(path_with_type->type);
if (typed_paths.contains(path_with_type->name))
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Found duplicated path with type: {}", path_with_type->name);
typed_paths.emplace(path_with_type->name, data_type);
}
else if (object_type_argument->skip_path)
{
const auto * identifier = object_type_argument->skip_path->as<ASTIdentifier>();
if (!identifier)
throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, "Unexpected AST in SKIP section of {} type arguments: {}. Expected identifier with path name", magic_enum::enum_name(schema_format), object_type_argument->skip_path->formatForErrorMessage());
paths_to_skip.insert(identifier->name());
}
else if (object_type_argument->skip_path_regexp)
{
const auto * literal = object_type_argument->skip_path_regexp->as<ASTLiteral>();
if (!literal || literal->value.getType() != Field::Types::String)
throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, "Unexpected AST in SKIP section of {} type arguments: {}. Expected identifier with path name", magic_enum::enum_name(schema_format), object_type_argument->skip_path->formatForErrorMessage());
path_regexps_to_skip.push_back(literal->value.safeGet<String>());
}
}
std::sort(path_regexps_to_skip.begin(), path_regexps_to_skip.end());
return std::make_shared<DataTypeObject>(schema_format, std::move(typed_paths), std::move(paths_to_skip), std::move(path_regexps_to_skip), max_dynamic_paths, max_dynamic_types);
}
/// Factory entry point for the JSON data type: parses optional type arguments
/// (typed paths, SKIP clauses, limits) via createObject with the JSON schema format.
static DataTypePtr createJSON(const ASTPtr & arguments)
{
    return createObject(arguments, DataTypeObject::SchemaFormat::JSON);
}
void registerDataTypeJSON(DataTypeFactory & factory)
{
    /// Register "JSON" for the new Object implementation only while the compatibility
    /// setting use_json_alias_for_old_object_type is disabled; when it is enabled,
    /// the "JSON" name remains an alias for the deprecated Object type.
    if (!Context::getGlobalContextInstance()->getSettingsRef().use_json_alias_for_old_object_type)
        factory.registerDataType("JSON", createJSON, DataTypeFactory::Case::Insensitive);
}
}

View File

@ -1,48 +1,80 @@
#pragma once
#include <DataTypes/IDataType.h>
#include <DataTypes/DataTypeDynamic.h>
#include <Core/Field.h>
#include <Columns/ColumnObject.h>
#include <Columns/ColumnObjectDeprecated.h>
#include <Common/re2.h>
namespace DB
{
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
}
class DataTypeObject : public IDataType
{
private:
String schema_format;
bool is_nullable;
public:
DataTypeObject(const String & schema_format_, bool is_nullable_);
enum class SchemaFormat
{
JSON = 0,
};
/// Don't change these constants, it can break backward compatibility.
static constexpr size_t DEFAULT_MAX_SEPARATELY_STORED_PATHS = 1024;
static constexpr size_t NESTED_OBJECT_MAX_DYNAMIC_PATHS_REDUCE_FACTOR = 4;
static constexpr size_t NESTED_OBJECT_MAX_DYNAMIC_TYPES_REDUCE_FACTOR = 2;
explicit DataTypeObject(
const SchemaFormat & schema_format_,
std::unordered_map<String, DataTypePtr> typed_paths_ = {},
std::unordered_set<String> paths_to_skip_ = {},
std::vector<String> path_regexps_to_skip_ = {},
size_t max_dynamic_paths_ = DEFAULT_MAX_SEPARATELY_STORED_PATHS,
size_t max_dynamic_types_ = DataTypeDynamic::DEFAULT_MAX_DYNAMIC_TYPES);
DataTypeObject(const SchemaFormat & schema_format_, size_t max_dynamic_paths_, size_t max_dynamic_types_);
const char * getFamilyName() const override { return "Object"; }
String doGetName() const override;
TypeIndex getTypeId() const override { return TypeIndex::Object; }
MutableColumnPtr createColumn() const override { return ColumnObject::create(is_nullable); }
MutableColumnPtr createColumn() const override;
Field getDefault() const override
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getDefault() is not implemented for data type {}", getName());
}
Field getDefault() const override { return Object(); }
bool haveSubtypes() const override { return false; }
bool equals(const IDataType & rhs) const override;
bool isParametric() const override { return true; }
bool hasDynamicSubcolumnsDeprecated() const override { return true; }
bool canBeInsideNullable() const override { return false; }
bool supportsSparseSerialization() const override { return false; }
bool canBeInsideSparseColumns() const override { return false; }
bool isComparable() const override { return false; }
bool haveSubtypes() const override { return false; }
bool equals(const IDataType & rhs) const override;
bool hasDynamicSubcolumnsData() const override { return true; }
std::unique_ptr<SubstreamData> getDynamicSubcolumnData(std::string_view subcolumn_name, const SubstreamData & data, bool throw_if_null) const override;
SerializationPtr doGetDefaultSerialization() const override;
bool hasNullableSubcolumns() const { return is_nullable; }
const SchemaFormat & getSchemaFormat() const { return schema_format; }
const std::unordered_map<String, DataTypePtr> & getTypedPaths() const { return typed_paths; }
const std::unordered_set<String> & getPathsToSkip() const { return paths_to_skip; }
const std::vector<String> & getPathRegexpsToSkip() const { return path_regexps_to_skip; }
const String & getSchemaFormat() const { return schema_format; }
size_t getMaxDynamicTypes() const { return max_dynamic_types; }
size_t getMaxDynamicPaths() const { return max_dynamic_paths; }
private:
SchemaFormat schema_format;
/// Set of paths with types that were specified in type declaration.
std::unordered_map<String, DataTypePtr> typed_paths;
/// Set of paths that should be skipped during data parsing.
std::unordered_set<String> paths_to_skip;
/// List of regular expressions that should be used to skip paths during data parsing.
std::vector<String> path_regexps_to_skip;
/// Limit on the number of paths that can be stored as subcolumn.
size_t max_dynamic_paths;
/// Limit of dynamic types that should be used for Dynamic columns.
size_t max_dynamic_types;
};
}

View File

@ -0,0 +1,87 @@
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/DataTypeObjectDeprecated.h>
#include <DataTypes/Serializations/SerializationObjectDeprecated.h>
#include <Parsers/IAST.h>
#include <Parsers/ASTLiteral.h>
#include <Parsers/ASTFunction.h>
#include <Parsers/ASTDataType.h>
#include <IO/Operators.h>
#include <Interpreters/Context.h>
#include <Core/Settings.h>
namespace DB
{
namespace ErrorCodes
{
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int UNEXPECTED_AST_STRUCTURE;
}
/// The schema format name is normalized to lower case, so equality checks and
/// serialization lookups on it are effectively case-insensitive.
DataTypeObjectDeprecated::DataTypeObjectDeprecated(const String & schema_format_, bool is_nullable_)
: schema_format(Poco::toLower(schema_format_))
, is_nullable(is_nullable_)
{
}
bool DataTypeObjectDeprecated::equals(const IDataType & rhs) const
{
    /// Two deprecated Object types are equal iff both the schema format
    /// (already lower-cased) and the nullability flag match.
    const auto * other = typeid_cast<const DataTypeObjectDeprecated *>(&rhs);
    if (!other)
        return false;
    return schema_format == other->schema_format && is_nullable == other->is_nullable;
}
/// Serialization is selected by the (lower-cased) schema format name.
SerializationPtr DataTypeObjectDeprecated::doGetDefaultSerialization() const
{
    return getObjectSerialization(schema_format);
}
String DataTypeObjectDeprecated::doGetName() const
{
    /// Produces "Object('<schema>')" or "Object(Nullable('<schema>'))".
    WriteBufferFromOwnString out;
    out << "Object(";
    if (is_nullable)
        out << "Nullable(" << quote << schema_format << ")";
    else
        out << quote << schema_format;
    out << ")";
    return out.str();
}
/// Parses "Object(<schema_name>)" or "Object(Nullable(<schema_name>))" arguments
/// and constructs the deprecated Object data type.
static DataTypePtr create(const ASTPtr & arguments)
{
    if (!arguments || arguments->children.size() != 1)
        throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
            "Object data type family must have one argument - name of schema format");

    bool is_nullable = false;
    ASTPtr schema_argument = arguments->children[0];

    /// Unwrap an optional Nullable(...) around the schema name.
    if (const auto * type = schema_argument->as<ASTDataType>())
    {
        const bool is_valid_nullable = type->name == "Nullable" && type->arguments->children.size() == 1;
        if (!is_valid_nullable)
            throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE,
                "Expected 'Nullable(<schema_name>)' as parameter for type Object (function: {})", type->name);
        is_nullable = true;
        schema_argument = type->arguments->children[0];
    }

    const auto * literal = schema_argument->as<ASTLiteral>();
    if (!literal || literal->value.getType() != Field::Types::String)
        throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE,
            "Object data type family must have a const string as its schema name parameter");

    return std::make_shared<DataTypeObjectDeprecated>(literal->value.safeGet<const String &>(), is_nullable);
}
/// Registers the deprecated Object type under its "Object" name. Additionally
/// registers "JSON" as an alias for it while the compatibility setting
/// use_json_alias_for_old_object_type is enabled.
void registerDataTypeObjectDeprecated(DataTypeFactory & factory)
{
    factory.registerDataType("Object", create);
    if (Context::getGlobalContextInstance()->getSettingsRef().use_json_alias_for_old_object_type)
        factory.registerSimpleDataType("JSON",
            [] { return std::make_shared<DataTypeObjectDeprecated>("JSON", false); },
            DataTypeFactory::Case::Insensitive);
}
}

View File

@ -0,0 +1,48 @@
#pragma once
#include <DataTypes/IDataType.h>
#include <Core/Field.h>
#include <Columns/ColumnObjectDeprecated.h>
namespace DB
{
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
}
/// Deprecated Object(<schema_format>) data type, e.g. Object('json')
/// (and, behind a compatibility setting, the "JSON" alias).
class DataTypeObjectDeprecated : public IDataType
{
private:
    /// Schema format name; stored lower-cased (normalized in the constructor).
    String schema_format;
    /// Whether subcolumns are Nullable (exposed via hasNullableSubcolumns()).
    bool is_nullable;

public:
    DataTypeObjectDeprecated(const String & schema_format_, bool is_nullable_);

    const char * getFamilyName() const override { return "Object"; }
    String doGetName() const override;
    TypeIndex getTypeId() const override { return TypeIndex::ObjectDeprecated; }

    MutableColumnPtr createColumn() const override { return ColumnObjectDeprecated::create(is_nullable); }

    /// There is no meaningful default value for the deprecated Object type.
    Field getDefault() const override
    {
        throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getDefault() is not implemented for data type {}", getName());
    }

    bool haveSubtypes() const override { return false; }
    bool equals(const IDataType & rhs) const override;
    bool isParametric() const override { return true; }
    /// Uses the legacy dynamic-subcolumns mechanism (predates the Dynamic-based one).
    bool hasDynamicSubcolumnsDeprecated() const override { return true; }
    SerializationPtr doGetDefaultSerialization() const override;

    bool hasNullableSubcolumns() const { return is_nullable; }

    const String & getSchemaFormat() const { return schema_format; }
};
}

View File

@ -11,7 +11,7 @@
#include <DataTypes/Serializations/SerializationTuple.h>
#include <DataTypes/Serializations/SerializationNamed.h>
#include <DataTypes/Serializations/SerializationInfoTuple.h>
#include <DataTypes/Serializations/SerializationVariantElement.h>
#include <DataTypes/Serializations/SerializationWrapper.h>
#include <DataTypes/NestedUtils.h>
#include <Parsers/IAST.h>
#include <Parsers/ASTNameTypePair.h>

View File

@ -9,6 +9,7 @@
#include <DataTypes/DataTypeFunction.h>
#include <DataTypes/DataTypeLowCardinality.h>
#include <DataTypes/DataTypeMap.h>
#include <DataTypes/DataTypeObject.h>
#include <DataTypes/DataTypeVariant.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeUUID.h>
@ -94,8 +95,13 @@ enum class BinaryTypeIndex : uint8_t
Bool = 0x2D,
SimpleAggregateFunction = 0x2E,
Nested = 0x2F,
JSON = 0x30,
};
/// In future we can introduce more arguments in the JSON data type definition.
/// To support such changes, use versioning in the serialization of JSON type.
const UInt8 TYPE_JSON_SERIALIZATION_VERSION = 0;
BinaryTypeIndex getBinaryTypeIndex(const DataTypePtr & type)
{
/// By default custom types don't have their own BinaryTypeIndex.
@ -202,7 +208,7 @@ BinaryTypeIndex getBinaryTypeIndex(const DataTypePtr & type)
return BinaryTypeIndex::LowCardinality;
case TypeIndex::Map:
return BinaryTypeIndex::Map;
case TypeIndex::Object:
case TypeIndex::ObjectDeprecated:
/// Object type will be deprecated and replaced by new implementation. No need to support it here.
throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Binary encoding of type Object is not supported");
case TypeIndex::IPv4:
@ -216,6 +222,15 @@ BinaryTypeIndex getBinaryTypeIndex(const DataTypePtr & type)
/// JSONPaths is used only during schema inference and cannot be used anywhere else.
case TypeIndex::JSONPaths:
throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Binary encoding of type JSONPaths is not supported");
case TypeIndex::Object:
{
const auto & object_type = assert_cast<const DataTypeObject &>(*type);
switch (object_type.getSchemaFormat())
{
case DataTypeObject::SchemaFormat::JSON:
return BinaryTypeIndex::JSON;
}
}
}
}
@ -480,6 +495,30 @@ void encodeDataType(const DataTypePtr & type, WriteBuffer & buf)
writeStringBinary(type_name, buf);
break;
}
case BinaryTypeIndex::JSON:
{
const auto & object_type = assert_cast<const DataTypeObject &>(*type);
/// Write version of the serialization because we can add new arguments in the JSON type.
writeBinary(TYPE_JSON_SERIALIZATION_VERSION, buf);
writeVarUInt(object_type.getMaxDynamicPaths(), buf);
writeBinary(UInt8(object_type.getMaxDynamicTypes()), buf);
const auto & typed_paths = object_type.getTypedPaths();
writeVarUInt(typed_paths.size(), buf);
for (const auto & [path, path_type] : typed_paths)
{
writeStringBinary(path, buf);
encodeDataType(path_type, buf);
}
const auto & paths_to_skip = object_type.getPathsToSkip();
writeVarUInt(paths_to_skip.size(), buf);
for (const auto & path : paths_to_skip)
writeStringBinary(path, buf);
const auto & path_regexps_to_skip = object_type.getPathRegexpsToSkip();
writeVarUInt(path_regexps_to_skip.size(), buf);
for (const auto & regexp : path_regexps_to_skip)
writeStringBinary(regexp, buf);
break;
}
default:
break;
}
@ -691,6 +730,54 @@ DataTypePtr decodeDataType(ReadBuffer & buf)
readStringBinary(type_name, buf);
return DataTypeFactory::instance().get(type_name);
}
case BinaryTypeIndex::JSON:
{
UInt8 serialization_version;
readBinary(serialization_version, buf);
if (serialization_version > TYPE_JSON_SERIALIZATION_VERSION)
throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected version of JSON type binary encoding");
size_t max_dynamic_paths;
readVarUInt(max_dynamic_paths, buf);
UInt8 max_dynamic_types;
readBinary(max_dynamic_types, buf);
size_t typed_paths_size;
readVarUInt(typed_paths_size, buf);
std::unordered_map<String, DataTypePtr> typed_paths;
for (size_t i = 0; i != typed_paths_size; ++i)
{
String path;
readStringBinary(path, buf);
typed_paths[path] = decodeDataType(buf);
}
size_t paths_to_skip_size;
readVarUInt(paths_to_skip_size, buf);
std::unordered_set<String> paths_to_skip;
paths_to_skip.reserve(paths_to_skip_size);
for (size_t i = 0; i != paths_to_skip_size; ++i)
{
String path;
readStringBinary(path, buf);
paths_to_skip.insert(path);
}
size_t path_regexps_to_skip_size;
readVarUInt(path_regexps_to_skip_size, buf);
std::vector<String> path_regexps_to_skip;
path_regexps_to_skip.reserve(path_regexps_to_skip_size);
for (size_t i = 0; i != path_regexps_to_skip_size; ++i)
{
String regexp;
readStringBinary(regexp, buf);
path_regexps_to_skip.push_back(regexp);
}
return std::make_shared<DataTypeObject>(
DataTypeObject::SchemaFormat::JSON,
typed_paths,
paths_to_skip,
path_regexps_to_skip,
max_dynamic_paths,
max_dynamic_types);
}
}
throw Exception(ErrorCodes::INCORRECT_DATA, "Unknown type code: {0:#04x}", UInt64(type));

View File

@ -8,9 +8,9 @@ namespace DB
/**
Binary encoding for ClickHouse data types:
|------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|---------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| ClickHouse data type | Binary encoding |
|------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|---------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| Nothing | 0x00 |
| UInt8 | 0x01 |
| UInt16 | 0x02 |
@ -44,7 +44,7 @@ Binary encoding for ClickHouse data types:
| Array(T) | 0x1E<nested_type_encoding> |
| Tuple(T1, ..., TN) | 0x1F<var_uint_number_of_elements><nested_type_encoding_1>...<nested_type_encoding_N> |
| Tuple(name1 T1, ..., nameN TN) | 0x20<var_uint_number_of_elements><var_uint_name_size_1><name_data_1><nested_type_encoding_1>...<var_uint_name_size_N><name_data_N><nested_type_encoding_N> |
| Set | 0x21 |
| Set | 0x21 |
| Interval | 0x22<interval_kind> |
| Nullable(T) | 0x23<nested_type_encoding> |
| Function | 0x24<var_uint_number_of_arguments><argument_type_encoding_1>...<argument_type_encoding_N><return_type_encoding> |
@ -59,7 +59,8 @@ Binary encoding for ClickHouse data types:
| Bool | 0x2D |
| SimpleAggregateFunction(function_name(param_1, ..., param_N), arg_T1, ..., arg_TN) | 0x2E<var_uint_function_name_size><function_name_data><var_uint_number_of_parameters><param_1>...<param_N><var_uint_number_of_arguments><argument_type_encoding_1>...<argument_type_encoding_N> |
| Nested(name1 T1, ..., nameN TN) | 0x2F<var_uint_number_of_elements><var_uint_name_size_1><name_data_1><nested_type_encoding_1>...<var_uint_name_size_N><name_data_N><nested_type_encoding_N> |
|------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| JSON(max_dynamic_paths=N, max_dynamic_types=M, path Type, SKIP skip_path, SKIP REGEXP skip_path_regexp) | 0x30<uint8_serialization_version><var_int_max_dynamic_paths><uint8_max_dynamic_types><var_uint_number_of_typed_paths><var_uint_path_name_size_1><path_name_data_1><encoded_type_1>...<var_uint_number_of_skip_paths><var_uint_skip_path_size_1><skip_path_data_1>...<var_uint_number_of_skip_path_regexps><var_uint_skip_path_regexp_size_1><skip_path_data_regexp_1>... |
|---------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
Interval kind binary encoding:
|---------------|-----------------|

View File

@ -178,8 +178,7 @@ DataTypePtr FieldToDataType<on_error>::operator() (const Map & map) const
template <LeastSupertypeOnError on_error>
DataTypePtr FieldToDataType<on_error>::operator() (const Object &) const
{
/// TODO: Do we need different parameters for type Object?
return std::make_shared<DataTypeObject>("json", false);
return std::make_shared<DataTypeObject>(DataTypeObject::SchemaFormat::JSON);
}
template <LeastSupertypeOnError on_error>

View File

@ -363,9 +363,10 @@ bool isArray(TYPE data_type) { return WhichDataType(data_type).isArray(); } \
bool isTuple(TYPE data_type) { return WhichDataType(data_type).isTuple(); } \
bool isMap(TYPE data_type) {return WhichDataType(data_type).isMap(); } \
bool isInterval(TYPE data_type) {return WhichDataType(data_type).isInterval(); } \
bool isObject(TYPE data_type) { return WhichDataType(data_type).isObject(); } \
bool isObjectDeprecated(TYPE data_type) { return WhichDataType(data_type).isObjectDeprecated(); } \
bool isVariant(TYPE data_type) { return WhichDataType(data_type).isVariant(); } \
bool isDynamic(TYPE data_type) { return WhichDataType(data_type).isDynamic(); } \
bool isObject(TYPE data_type) { return WhichDataType(data_type).isObject(); } \
bool isNothing(TYPE data_type) { return WhichDataType(data_type).isNothing(); } \
\
bool isColumnedAsNumber(TYPE data_type) \

View File

@ -432,7 +432,7 @@ struct WhichDataType
constexpr bool isMap() const {return idx == TypeIndex::Map; }
constexpr bool isSet() const { return idx == TypeIndex::Set; }
constexpr bool isInterval() const { return idx == TypeIndex::Interval; }
constexpr bool isObject() const { return idx == TypeIndex::Object; }
constexpr bool isObjectDeprecated() const { return idx == TypeIndex::ObjectDeprecated; }
constexpr bool isNothing() const { return idx == TypeIndex::Nothing; }
constexpr bool isNullable() const { return idx == TypeIndex::Nullable; }
@ -444,6 +444,7 @@ struct WhichDataType
constexpr bool isVariant() const { return idx == TypeIndex::Variant; }
constexpr bool isDynamic() const { return idx == TypeIndex::Dynamic; }
constexpr bool isObject() const { return idx == TypeIndex::Object; }
};
/// IDataType helpers (alternative for IDataType virtual methods with single point of truth)
@ -502,9 +503,10 @@ bool isArray(TYPE data_type); \
bool isTuple(TYPE data_type); \
bool isMap(TYPE data_type); \
bool isInterval(TYPE data_type); \
bool isObject(TYPE data_type); \
bool isObjectDeprecated(TYPE data_type); \
bool isVariant(TYPE data_type); \
bool isDynamic(TYPE data_type); \
bool isObject(TYPE data_type); \
bool isNothing(TYPE data_type); \
\
bool isColumnedAsNumber(TYPE data_type); \

View File

@ -4,7 +4,7 @@
#include <Analyzer/QueryNode.h>
#include <Analyzer/Utils.h>
#include <DataTypes/ObjectUtils.h>
#include <DataTypes/DataTypeObject.h>
#include <DataTypes/DataTypeObjectDeprecated.h>
#include <DataTypes/DataTypeNothing.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeMap.h>
@ -16,7 +16,7 @@
#include <DataTypes/getLeastSupertype.h>
#include <DataTypes/NestedUtils.h>
#include <Storages/StorageSnapshot.h>
#include <Columns/ColumnObject.h>
#include <Columns/ColumnObjectDeprecated.h>
#include <Columns/ColumnTuple.h>
#include <Columns/ColumnArray.h>
#include <Columns/ColumnMap.h>
@ -180,12 +180,12 @@ static DataTypePtr recreateTupleWithElements(const DataTypeTuple & type_tuple, c
}
static std::pair<ColumnPtr, DataTypePtr> convertObjectColumnToTuple(
const ColumnObject & column_object, const DataTypeObject & type_object)
const ColumnObjectDeprecated & column_object, const DataTypeObjectDeprecated & type_object)
{
if (!column_object.isFinalized())
{
auto finalized = column_object.cloneFinalized();
const auto & finalized_object = assert_cast<const ColumnObject &>(*finalized);
const auto & finalized_object = assert_cast<const ColumnObjectDeprecated &>(*finalized);
return convertObjectColumnToTuple(finalized_object, type_object);
}
@ -211,9 +211,9 @@ static std::pair<ColumnPtr, DataTypePtr> recursivlyConvertDynamicColumnToTuple(
if (!type->hasDynamicSubcolumnsDeprecated())
return {column, type};
if (const auto * type_object = typeid_cast<const DataTypeObject *>(type.get()))
if (const auto * type_object = typeid_cast<const DataTypeObjectDeprecated *>(type.get()))
{
const auto & column_object = assert_cast<const ColumnObject &>(*column);
const auto & column_object = assert_cast<const ColumnObjectDeprecated &>(*column);
return convertObjectColumnToTuple(column_object, *type_object);
}
@ -369,7 +369,7 @@ static DataTypePtr getLeastCommonTypeForObject(const DataTypes & types, bool che
for (const auto & [key, subtypes] : subcolumns_types)
{
assert(!subtypes.empty());
if (key.getPath() == ColumnObject::COLUMN_NAME_DUMMY)
if (key.getPath() == ColumnObjectDeprecated::COLUMN_NAME_DUMMY)
continue;
size_t first_dim = getNumberOfDimensions(*subtypes[0]);
@ -385,7 +385,7 @@ static DataTypePtr getLeastCommonTypeForObject(const DataTypes & types, bool che
if (tuple_paths.empty())
{
tuple_paths.emplace_back(ColumnObject::COLUMN_NAME_DUMMY);
tuple_paths.emplace_back(ColumnObjectDeprecated::COLUMN_NAME_DUMMY);
tuple_types.emplace_back(std::make_shared<DataTypeUInt8>());
}
@ -452,7 +452,7 @@ static DataTypePtr getLeastCommonTypeForDynamicColumnsImpl(
if (!type_in_storage->hasDynamicSubcolumnsDeprecated())
return type_in_storage;
if (isObject(type_in_storage))
if (isObjectDeprecated(type_in_storage))
return getLeastCommonTypeForObject(concrete_types, check_ambiguos_paths);
if (const auto * type_array = typeid_cast<const DataTypeArray *>(type_in_storage.get()))
@ -494,9 +494,9 @@ DataTypePtr createConcreteEmptyDynamicColumn(const DataTypePtr & type_in_storage
if (!type_in_storage->hasDynamicSubcolumnsDeprecated())
return type_in_storage;
if (isObject(type_in_storage))
if (isObjectDeprecated(type_in_storage))
return std::make_shared<DataTypeTuple>(
DataTypes{std::make_shared<DataTypeUInt8>()}, Names{ColumnObject::COLUMN_NAME_DUMMY});
DataTypes{std::make_shared<DataTypeUInt8>()}, Names{ColumnObjectDeprecated::COLUMN_NAME_DUMMY});
if (const auto * type_array = typeid_cast<const DataTypeArray *>(type_in_storage.get()))
return std::make_shared<DataTypeArray>(
@ -838,7 +838,7 @@ DataTypePtr unflattenTuple(const PathsInData & paths, const DataTypes & tuple_ty
return unflattenTuple(paths, tuple_types, tuple_columns).second;
}
std::pair<ColumnPtr, DataTypePtr> unflattenObjectToTuple(const ColumnObject & column)
std::pair<ColumnPtr, DataTypePtr> unflattenObjectToTuple(const ColumnObjectDeprecated & column)
{
const auto & subcolumns = column.getSubcolumns();
@ -846,7 +846,7 @@ std::pair<ColumnPtr, DataTypePtr> unflattenObjectToTuple(const ColumnObject & co
{
auto type = std::make_shared<DataTypeTuple>(
DataTypes{std::make_shared<DataTypeUInt8>()},
Names{ColumnObject::COLUMN_NAME_DUMMY});
Names{ColumnObjectDeprecated::COLUMN_NAME_DUMMY});
return {type->createColumn()->cloneResized(column.size()), type};
}

View File

@ -6,7 +6,7 @@
#include <Storages/ColumnsDescription.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypesNumber.h>
#include <Columns/ColumnObject.h>
#include <Columns/ColumnObjectDeprecated.h>
namespace DB
{
@ -88,7 +88,7 @@ DataTypePtr unflattenTuple(
const PathsInData & paths,
const DataTypes & tuple_types);
std::pair<ColumnPtr, DataTypePtr> unflattenObjectToTuple(const ColumnObject & column);
std::pair<ColumnPtr, DataTypePtr> unflattenObjectToTuple(const ColumnObjectDeprecated & column);
std::pair<ColumnPtr, DataTypePtr> unflattenTuple(
const PathsInData & paths,

View File

@ -202,6 +202,12 @@ String getNameForSubstreamPath(
stream_name += "." + it->variant_element_name + ".null";
else if (it->type == SubstreamType::DynamicStructure)
stream_name += ".dynamic_structure";
else if (it->type == SubstreamType::ObjectStructure)
stream_name += ".object_structure";
else if (it->type == SubstreamType::ObjectSharedData)
stream_name += ".object_shared_data";
else if (it->type == SubstreamType::ObjectTypedPath || it->type == SubstreamType::ObjectDynamicPath)
stream_name += "." + it->object_path_name;
}
return stream_name;
@ -401,7 +407,17 @@ bool ISerialization::hasSubcolumnForPath(const SubstreamPath & path, size_t pref
|| path[last_elem].type == Substream::TupleElement
|| path[last_elem].type == Substream::ArraySizes
|| path[last_elem].type == Substream::VariantElement
|| path[last_elem].type == Substream::VariantElementNullMap;
|| path[last_elem].type == Substream::VariantElementNullMap
|| path[last_elem].type == Substream::ObjectTypedPath;
}
bool ISerialization::isEphemeralSubcolumn(const DB::ISerialization::SubstreamPath & path, size_t prefix_len)
{
if (prefix_len == 0 || prefix_len > path.size())
return false;
size_t last_elem = prefix_len - 1;
return path[last_elem].type == Substream::VariantElementNullMap;
}
ISerialization::SubstreamData ISerialization::createFromPath(const SubstreamPath & path, size_t prefix_len)

View File

@ -176,8 +176,8 @@ public:
SparseElements,
SparseOffsets,
ObjectStructure,
ObjectData,
DeprecatedObjectStructure,
DeprecatedObjectData,
VariantDiscriminators,
NamedVariantDiscriminators,
@ -189,6 +189,12 @@ public:
DynamicData,
DynamicStructure,
ObjectData,
ObjectTypedPath,
ObjectDynamicPath,
ObjectSharedData,
ObjectStructure,
Regular,
};
@ -203,6 +209,9 @@ public:
/// Name of substream for type from 'named_types'.
String name_of_substream;
/// Path name for Object type elements.
String object_path_name;
/// Data for current substream.
SubstreamData data;
@ -263,13 +272,13 @@ public:
bool use_compact_variant_discriminators_serialization = false;
enum class DynamicStatisticsMode
enum class ObjectAndDynamicStatisticsMode
{
NONE, /// Don't write statistics.
PREFIX, /// Write statistics in prefix.
SUFFIX, /// Write statistics in suffix.
};
DynamicStatisticsMode dynamic_write_statistics = DynamicStatisticsMode::NONE;
ObjectAndDynamicStatisticsMode object_and_dynamic_write_statistics = ObjectAndDynamicStatisticsMode::NONE;
};
struct DeserializeBinaryBulkSettings
@ -290,7 +299,7 @@ public:
/// If not zero, may be used to avoid reallocations while reading column of String type.
double avg_value_size_hint = 0;
bool dynamic_read_statistics = false;
bool object_and_dynamic_read_statistics = false;
};
/// Call before serializeBinaryBulkWithMultipleStreams chain to write something before first mark.
@ -440,6 +449,10 @@ public:
static bool hasSubcolumnForPath(const SubstreamPath & path, size_t prefix_len);
static SubstreamData createFromPath(const SubstreamPath & path, size_t prefix_len);
/// Returns true if subcolumn doesn't actually stores any data in column and doesn't require a separate stream
/// for writing/reading data. For example, it's a null-map subcolumn of Variant type (it's always constructed from discriminators);.
static bool isEphemeralSubcolumn(const SubstreamPath & path, size_t prefix_len);
protected:
template <typename State, typename StatePtr>
State * checkAndGetState(const StatePtr & state) const;

View File

@ -143,7 +143,7 @@ void SerializationDynamic::serializeBinaryBulkStatePrefix(
}
/// Write statistics in prefix if needed.
if (settings.dynamic_write_statistics == SerializeBinaryBulkSettings::DynamicStatisticsMode::PREFIX)
if (settings.object_and_dynamic_write_statistics == SerializeBinaryBulkSettings::ObjectAndDynamicStatisticsMode::PREFIX)
{
const auto & statistics = column_dynamic.getStatistics();
/// First, write statistics for usual variants.
@ -225,8 +225,8 @@ void SerializationDynamic::deserializeBinaryBulkStatePrefix(
return;
auto dynamic_state = std::make_shared<DeserializeBinaryBulkStateDynamic>();
dynamic_state->structure_state = structure_state;
dynamic_state->variant_serialization = checkAndGetState<DeserializeBinaryBulkStateDynamicStructure>(structure_state)->variant_type->getDefaultSerialization();
dynamic_state->structure_state = std::move(structure_state);
dynamic_state->variant_serialization = checkAndGetState<DeserializeBinaryBulkStateDynamicStructure>(dynamic_state->structure_state)->variant_type->getDefaultSerialization();
settings.path.push_back(Substream::DynamicData);
dynamic_state->variant_serialization->deserializeBinaryBulkStatePrefix(settings, dynamic_state->variant_state, cache);
@ -243,7 +243,7 @@ ISerialization::DeserializeBinaryBulkStatePtr SerializationDynamic::deserializeD
DeserializeBinaryBulkStatePtr state = nullptr;
if (auto cached_state = getFromSubstreamsDeserializeStatesCache(cache, settings.path))
{
state = cached_state;
state = std::move(cached_state);
}
else if (auto * structure_stream = settings.getter(settings.path))
{
@ -277,16 +277,12 @@ ISerialization::DeserializeBinaryBulkStatePtr SerializationDynamic::deserializeD
auto variant_type = std::make_shared<DataTypeVariant>(variants);
/// Read statistics.
if (settings.dynamic_read_statistics)
if (settings.object_and_dynamic_read_statistics)
{
ColumnDynamic::Statistics statistics(ColumnDynamic::Statistics::Source::READ);
/// First, read statistics for usual variants.
size_t variant_size;
for (const auto & variant : variant_type->getVariants())
{
readVarUInt(variant_size, *structure_stream);
statistics.variants_statistics[variant->getName()] = variant_size;
}
readVarUInt(statistics.variants_statistics[variant->getName()], *structure_stream);
/// Second, read statistics for shared variants.
size_t statistics_size;
@ -295,8 +291,7 @@ ISerialization::DeserializeBinaryBulkStatePtr SerializationDynamic::deserializeD
for (size_t i = 0; i != statistics_size; ++i)
{
readStringBinary(variant_name, *structure_stream);
readVarUInt(variant_size, *structure_stream);
statistics.shared_variants_statistics[variant_name] = variant_size;
readVarUInt(statistics.shared_variants_statistics[variant_name], *structure_stream);
}
structure_state->statistics = std::make_shared<const ColumnDynamic::Statistics>(std::move(statistics));
@ -320,10 +315,10 @@ void SerializationDynamic::serializeBinaryBulkStateSuffix(
settings.path.pop_back();
if (!stream)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Missing stream for Dynamic column structure during serialization of binary bulk state prefix");
throw Exception(ErrorCodes::LOGICAL_ERROR, "Missing stream for Dynamic column structure during serialization of binary bulk state suffix");
/// Write statistics in suffix if needed.
if (settings.dynamic_write_statistics == SerializeBinaryBulkSettings::DynamicStatisticsMode::SUFFIX)
if (settings.object_and_dynamic_write_statistics == SerializeBinaryBulkSettings::ObjectAndDynamicStatisticsMode::SUFFIX)
{
/// First, write statistics for usual variants.
for (const auto & variant_name : dynamic_state->variant_names)
@ -348,6 +343,18 @@ void SerializationDynamic::serializeBinaryBulkWithMultipleStreams(
size_t limit,
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const
{
size_t tmp_size;
serializeBinaryBulkWithMultipleStreamsAndCountTotalSizeOfVariants(column, offset, limit, settings, state, tmp_size);
}
void SerializationDynamic::serializeBinaryBulkWithMultipleStreamsAndCountTotalSizeOfVariants(
const IColumn & column,
size_t offset,
size_t limit,
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state,
size_t & total_size_of_variants) const
{
const auto & column_dynamic = assert_cast<const ColumnDynamic &>(column);
auto * dynamic_state = checkAndGetState<SerializeBinaryBulkStateDynamic>(state);
@ -361,10 +368,18 @@ void SerializationDynamic::serializeBinaryBulkWithMultipleStreams(
throw Exception(ErrorCodes::LOGICAL_ERROR, "Mismatch of max_dynamic_types parameter of Dynamic. Expected: {}, Got: {}", dynamic_state->max_dynamic_types, column_dynamic.getMaxDynamicTypes());
settings.path.push_back(Substream::DynamicData);
assert_cast<const SerializationVariant &>(*dynamic_state->variant_serialization)
.serializeBinaryBulkWithMultipleStreamsAndUpdateVariantStatistics(
*variant_column,
offset,
limit,
settings,
dynamic_state->variant_state,
dynamic_state->statistics.variants_statistics,
total_size_of_variants);
if (dynamic_state->recalculate_statistics)
{
assert_cast<const SerializationVariant &>(*dynamic_state->variant_serialization)
.serializeBinaryBulkWithMultipleStreamsAndUpdateVariantStatistics(*variant_column, offset, limit, settings, dynamic_state->variant_state, dynamic_state->statistics.variants_statistics);
/// Calculate statistics for shared variants.
const auto & shared_variant = column_dynamic.getSharedVariant();
if (!shared_variant.empty())
@ -389,10 +404,6 @@ void SerializationDynamic::serializeBinaryBulkWithMultipleStreams(
}
}
}
else
{
assert_cast<const SerializationVariant &>(*dynamic_state->variant_serialization).serializeBinaryBulkWithMultipleStreams(*variant_column, offset, limit, settings, dynamic_state->variant_state);
}
settings.path.pop_back();
}
@ -753,6 +764,12 @@ void SerializationDynamic::serializeTextJSON(const IColumn & column, size_t row_
serializeTextImpl(column, row_num, ostr, settings, nested_serialize);
}
/// Pretty JSON output for Dynamic: delegate to the default serialization of the
/// underlying Variant type, passing the current indentation level through.
void SerializationDynamic::serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent) const
{
    const auto & dynamic_column = assert_cast<const ColumnDynamic &>(column);
    dynamic_column.getVariantInfo().variant_type->getDefaultSerialization()->serializeTextJSONPretty(dynamic_column.getVariantColumn(), row_num, ostr, settings, indent);
}
void SerializationDynamic::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
auto read_field = [&settings](ReadBuffer & buf)

View File

@ -1,6 +1,7 @@
#pragma once
#include <DataTypes/Serializations/ISerialization.h>
#include <DataTypes/DataTypeDynamic.h>
#include <Columns/ColumnDynamic.h>
namespace DB
@ -11,7 +12,7 @@ class SerializationDynamicElement;
class SerializationDynamic : public ISerialization
{
public:
explicit SerializationDynamic(size_t max_dynamic_types_) : max_dynamic_types(max_dynamic_types_)
explicit SerializationDynamic(size_t max_dynamic_types_ = DataTypeDynamic::DEFAULT_MAX_DYNAMIC_TYPES) : max_dynamic_types(max_dynamic_types_)
{
}
@ -59,6 +60,14 @@ public:
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void serializeBinaryBulkWithMultipleStreamsAndCountTotalSizeOfVariants(
const IColumn & column,
size_t offset,
size_t limit,
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state,
size_t & total_size_of_variants) const;
void deserializeBinaryBulkWithMultipleStreams(
ColumnPtr & column,
size_t limit,
@ -89,6 +98,7 @@ public:
bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
void serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent) const override;
void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;

View File

@ -53,6 +53,7 @@ void SerializationDynamicElement::enumerateStreams(
.withColumn(data.column)
.withSerializationInfo(data.serialization_info)
.withDeserializeState(deserialize_state->variant_element_state);
settings.path.back().data = variant_data;
deserialize_state->variant_serialization->enumerateStreams(settings, callback, variant_data);
settings.path.pop_back();
}

View File

@ -0,0 +1,405 @@
#include <DataTypes/Serializations/SerializationJSON.h>
#include <IO/WriteHelpers.h>
#include <IO/ReadHelpers.h>
#if USE_SIMDJSON
#include <Common/JSONParsers/SimdJSONParser.h>
#endif
#if USE_RAPIDJSON
#include <Common/JSONParsers/RapidJSONParser.h>
#endif
#include <Common/JSONParsers/DummyJSONParser.h>
namespace DB
{
namespace ErrorCodes
{
extern const int INCORRECT_DATA;
}
/// Construct text (de)serialization for the JSON data type.
/// typed_paths_serializations_ - serializations for paths with explicitly declared types.
/// paths_to_skip_ / path_regexps_to_skip_ - paths excluded during parsing (exact / by regexp).
/// json_extract_tree_ - prebuilt tree used to insert parsed JSON documents into the column.
template <typename Parser>
SerializationJSON<Parser>::SerializationJSON(
    std::unordered_map<String, SerializationPtr> typed_paths_serializations_,
    const std::unordered_set<String> & paths_to_skip_,
    const std::vector<String> & path_regexps_to_skip_,
    std::unique_ptr<JSONExtractTreeNode<Parser>> json_extract_tree_)
    : SerializationObject(std::move(typed_paths_serializations_), paths_to_skip_, path_regexps_to_skip_)
    , json_extract_tree(std::move(json_extract_tree_))
{
}
namespace
{
/// Struct that represents elements of the JSON path.
/// "a.b.c" -> ["a", "b", "c"]
struct PathElements
{
explicit PathElements(const String & path)
{
const char * start = path.data();
const char * end = start + path.size();
const char * pos = start;
const char * last_dot_pos = pos - 1;
for (pos = start; pos != end; ++pos)
{
if (*pos == '.')
{
elements.emplace_back(last_dot_pos + 1, size_t(pos - last_dot_pos - 1));
last_dot_pos = pos;
}
}
elements.emplace_back(last_dot_pos + 1, size_t(pos - last_dot_pos - 1));
}
size_t size() const { return elements.size(); }
std::vector<std::string_view> elements;
};
/// Struct that represents a prefix of a JSON path. Used during output of the JSON object.
struct Prefix
{
/// Shrink current prefix to the common prefix of current prefix and specified path.
/// For example, if current prefix is a.b.c.d and path is a.b.e, then shrink the prefix to a.b.
void shrinkToCommonPrefix(const PathElements & path_elements)
{
/// Don't include last element in path_elements in the prefix.
size_t i = 0;
while (i != elements.size() && i != (path_elements.elements.size() - 1) && elements[i].first == path_elements.elements[i])
++i;
elements.resize(i);
}
/// Check is_first flag in current object.
bool isFirstInCurrentObject() const
{
if (elements.empty())
return root_is_first_flag;
return elements.back().second;
}
/// Set flag is_first = false in current object.
void setNotFirstInCurrentObject()
{
if (elements.empty())
root_is_first_flag = false;
else
elements.back().second = false;
}
size_t size() const { return elements.size(); }
/// Elements of the prefix: (path element, is_first flag in this prefix).
/// is_first flag indicates if we already serialized some key in the object with such prefix.
std::vector<std::pair<std::string_view, bool>> elements;
bool root_is_first_flag = true;
};
}
/// Serialize one row of a JSON column as a text JSON object.
/// The row's paths come from three sources: statically typed paths, dynamic paths,
/// and the shared data (rarely-occurring paths stored in binary form).
/// If `pretty` is true, output is multiline with 4-space indentation starting at `indent` levels.
template <typename Parser>
void SerializationJSON<Parser>::serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, bool pretty, size_t indent) const
{
    const auto & column_object = assert_cast<const ColumnObject &>(column);
    const auto & typed_paths = column_object.getTypedPaths();
    const auto & dynamic_paths = column_object.getDynamicPaths();
    const auto & shared_data_offsets = column_object.getSharedDataOffsets();
    const auto [shared_data_paths, shared_data_values] = column_object.getSharedDataPathsAndValues();
    /// Range [shared_data_offset, shared_data_end) of this row's entries in the shared data.
    size_t shared_data_offset = shared_data_offsets[static_cast<ssize_t>(row_num) - 1];
    size_t shared_data_end = shared_data_offsets[static_cast<ssize_t>(row_num)];

    /// We need to convert the set of paths in this row to a JSON object.
    /// To do it, we first collect all the paths from current row, then we sort them
    /// and construct the resulting JSON object by iterating over sorted list of paths.
    /// For example:
    /// b.c, a.b, a.a, b.e, g, h.u.t -> a.a, a.b, b.c, b.e, g, h.u.t -> {"a" : {"a" : ..., "b" : ...}, "b" : {"c" : ..., "e" : ...}, "g" : ..., "h" : {"u" : {"t" : ...}}}.
    std::vector<String> sorted_paths;
    sorted_paths.reserve(typed_paths.size() + dynamic_paths.size() + (shared_data_end - shared_data_offset));
    for (const auto & [path, _] : typed_paths)
        sorted_paths.emplace_back(path);
    for (const auto & [path, dynamic_column] : dynamic_paths)
    {
        /// We consider null value and absence of the path in a row as equivalent cases, because we cannot actually distinguish them.
        /// So, we don't output null values at all.
        if (!dynamic_column->isNullAt(row_num))
            sorted_paths.emplace_back(path);
    }
    for (size_t i = shared_data_offset; i != shared_data_end; ++i)
    {
        auto path = shared_data_paths->getDataAt(i).toString();
        sorted_paths.emplace_back(path);
    }

    std::sort(sorted_paths.begin(), sorted_paths.end());

    if (pretty)
        writeCString("{\n", ostr);
    else
        writeChar('{', ostr);
    /// Shared data values are consumed in the same (sorted) order their paths appear in sorted_paths.
    size_t index_in_shared_data_values = shared_data_offset;
    /// current_prefix represents the path of the object we are currently serializing keys in.
    Prefix current_prefix;
    for (const auto & path : sorted_paths)
    {
        PathElements path_elements(path);
        /// Change prefix to common prefix between current prefix and current path.
        /// If prefix changed (it can only decrease), close all finished objects.
        /// For example:
        /// Current prefix: a.b.c.d
        /// Current path: a.b.e.f
        /// It means now we have : {..., "a" : {"b" : {"c" : {"d" : ...
        /// Common prefix will be a.b, so it means we should close objects a.b.c.d and a.b.c: {..., "a" : {"b" : {"c" : {"d" : ...}}
        /// and continue serializing keys in object a.b
        size_t prev_prefix_size = current_prefix.size();
        current_prefix.shrinkToCommonPrefix(path_elements);
        size_t prefix_size = current_prefix.size();
        if (prefix_size != prev_prefix_size)
        {
            size_t objects_to_close = prev_prefix_size - prefix_size;
            if (pretty)
            {
                writeChar('\n', ostr);
                /// Closing braces are emitted from the deepest object outward, each on its own line.
                for (size_t i = 0; i != objects_to_close; ++i)
                {
                    writeChar(' ', (indent + prefix_size + objects_to_close - i) * 4, ostr);
                    if (i != objects_to_close - 1)
                        writeCString("}\n", ostr);
                    else
                        writeChar('}', ostr);
                }
            }
            else
            {
                for (size_t i = 0; i != objects_to_close; ++i)
                    writeChar('}', ostr);
            }
        }

        /// Now we are inside object that has common prefix with current path.
        /// We should go inside all objects in current path.
        /// From the example above we should open object a.b.e:
        /// {..., "a" : {"b" : {"c" : {"d" : ...}}, "e" : {
        if (prefix_size + 1 < path_elements.size())
        {
            for (size_t i = prefix_size; i != path_elements.size() - 1; ++i)
            {
                /// Write comma before the key if it's not the first key in this prefix.
                if (!current_prefix.isFirstInCurrentObject())
                {
                    if (pretty)
                        writeCString(",\n", ostr);
                    else
                        writeChar(',', ostr);
                }
                else
                {
                    current_prefix.setNotFirstInCurrentObject();
                }

                if (pretty)
                {
                    writeChar(' ', (indent + i + 1) * 4, ostr);
                    writeJSONString(path_elements.elements[i], ostr, settings);
                    writeCString(" : {\n", ostr);
                }
                else
                {
                    writeJSONString(path_elements.elements[i], ostr, settings);
                    writeCString(":{", ostr);
                }

                /// Update current prefix.
                current_prefix.elements.emplace_back(path_elements.elements[i], true);
            }
        }

        /// Write comma before the key if it's not the first key in this prefix.
        if (!current_prefix.isFirstInCurrentObject())
        {
            if (pretty)
                writeCString(",\n", ostr);
            else
                writeChar(',', ostr);
        }
        else
        {
            current_prefix.setNotFirstInCurrentObject();
        }

        /// Write the last path element as the key of the value.
        if (pretty)
        {
            writeChar(' ', (indent + current_prefix.size() + 1) * 4, ostr);
            writeJSONString(path_elements.elements.back(), ostr, settings);
            writeCString(" : ", ostr);
        }
        else
        {
            writeJSONString(path_elements.elements.back(), ostr, settings);
            writeCString(":", ostr);
        }

        /// Serialize value of current path.
        if (auto typed_it = typed_paths.find(path); typed_it != typed_paths.end())
        {
            /// Typed path: use the serialization declared for this path.
            if (pretty)
                typed_path_serializations.at(path)->serializeTextJSONPretty(*typed_it->second, row_num, ostr, settings, indent + current_prefix.size() + 1);
            else
                typed_path_serializations.at(path)->serializeTextJSON(*typed_it->second, row_num, ostr, settings);
        }
        else if (auto dynamic_it = dynamic_paths.find(path); dynamic_it != dynamic_paths.end())
        {
            /// Dynamic path: the value is stored in a Dynamic column.
            if (pretty)
                dynamic_serialization->serializeTextJSONPretty(*dynamic_it->second, row_num, ostr, settings, indent + current_prefix.size() + 1);
            else
                dynamic_serialization->serializeTextJSON(*dynamic_it->second, row_num, ostr, settings);
        }
        else
        {
            /// To serialize value stored in shared data we should first deserialize it from binary format.
            auto tmp_dynamic_column = ColumnDynamic::create();
            tmp_dynamic_column->reserve(1);
            column_object.deserializeValueFromSharedData(shared_data_values, index_in_shared_data_values++, *tmp_dynamic_column);
            if (pretty)
                dynamic_serialization->serializeTextJSONPretty(*tmp_dynamic_column, 0, ostr, settings, indent + current_prefix.size() + 1);
            else
                dynamic_serialization->serializeTextJSON(*tmp_dynamic_column, 0, ostr, settings);
        }
    }

    /// Close all remaining open objects.
    if (pretty)
    {
        writeChar('\n', ostr);
        for (size_t i = 0; i != current_prefix.elements.size(); ++i)
        {
            writeChar(' ', (indent + current_prefix.size() - i) * 4, ostr);
            writeCString("}\n", ostr);
        }
        writeChar(' ', indent * 4, ostr);
        writeChar('}', ostr);
    }
    else
    {
        for (size_t i = 0; i != current_prefix.elements.size(); ++i)
            writeChar('}', ostr);
        writeChar('}', ostr);
    }
}
/// Parse the given string as a JSON document and insert it into the column
/// using the prebuilt JSON extract tree.
template <typename Parser>
void SerializationJSON<Parser>::deserializeTextImpl(IColumn & column, std::string_view object, const FormatSettings & settings) const
{
    /// Parsers are pooled so that the serialization stays thread safe.
    auto parser = parsers_pool.get([] { return new Parser; });

    typename Parser::Element document;
    bool parsed = parser->parse(object, document);
    if (!parsed)
        throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse JSON object here: {}", object);

    String error;
    bool inserted = json_extract_tree->insertResultToColumn(column, document, insert_settings, settings, error);
    if (!inserted)
        throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot insert data into JSON column: {}", error);
}
/// Plain text output: the row is rendered as a compact (non-pretty) JSON object.
template <typename Parser>
void SerializationJSON<Parser>::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
    serializeTextImpl(column, row_num, ostr, settings, /*pretty=*/ false, /*indent=*/ 0);
}
/// Consume the rest of the buffer and parse it as a single JSON document.
template <typename Parser>
void SerializationJSON<Parser>::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
    String whole_text;
    readStringUntilEOF(whole_text, istr);
    deserializeTextImpl(column, whole_text, settings);
}
/// TSV output: serialize the JSON object to a temporary buffer, then write it escaped.
template <typename Parser>
void SerializationJSON<Parser>::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
    WriteBufferFromOwnString object_buf;
    serializeTextImpl(column, row_num, object_buf, settings);
    writeEscapedString(object_buf.str(), ostr);
}
/// TSV input: read an escaped string and parse it as a JSON document.
template <typename Parser>
void SerializationJSON<Parser>::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
    String escaped_text;
    readEscapedString(escaped_text, istr);
    deserializeTextImpl(column, escaped_text, settings);
}
/// Quoted output: serialize the JSON object to a temporary buffer, then write it as a quoted string.
template <typename Parser>
void SerializationJSON<Parser>::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
    WriteBufferFromOwnString object_buf;
    serializeTextImpl(column, row_num, object_buf, settings);
    writeQuotedString(object_buf.str(), ostr);
}
/// Quoted input: read a quoted string and parse it as a JSON document.
template <typename Parser>
void SerializationJSON<Parser>::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
    String quoted_text;
    readQuotedString(quoted_text, istr);
    deserializeTextImpl(column, quoted_text, settings);
}
/// CSV output: serialize the JSON object to a temporary buffer, then write it CSV-quoted.
template <typename Parser>
void SerializationJSON<Parser>::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
    WriteBufferFromOwnString object_buf;
    serializeTextImpl(column, row_num, object_buf, settings);
    writeCSVString(object_buf.str(), ostr);
}
/// CSV input: read a CSV field (honoring CSV format settings) and parse it as a JSON document.
template <typename Parser>
void SerializationJSON<Parser>::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
    String csv_text;
    readCSVString(csv_text, istr, settings.csv);
    deserializeTextImpl(column, csv_text, settings);
}
/// XML output: serialize the JSON object to a temporary buffer, then write it XML-escaped.
template <typename Parser>
void SerializationJSON<Parser>::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
    WriteBufferFromOwnString object_buf;
    serializeTextImpl(column, row_num, object_buf, settings);
    writeXMLStringForTextElement(object_buf.str(), ostr);
}
/// JSON output: same compact representation as serializeText.
template <typename Parser>
void SerializationJSON<Parser>::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
    serializeTextImpl(column, row_num, ostr, settings, /*pretty=*/ false, /*indent=*/ 0);
}
/// Pretty JSON output: multiline object with 4-space indentation at the given level.
template <typename Parser>
void SerializationJSON<Parser>::serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent) const
{
    serializeTextImpl(column, row_num, ostr, settings, /*pretty=*/ true, /*indent=*/ indent);
}
/// JSON input: extract the next JSON object from the buffer as a view
/// (possibly syntactically invalid - the parser will report errors) and parse it.
template <typename Parser>
void SerializationJSON<Parser>::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
    String view_backing_buffer;
    auto object_view = readJSONObjectAsViewPossiblyInvalid(istr, view_backing_buffer);
    deserializeTextImpl(column, object_view, settings);
}
/// Explicit instantiations for the available JSON parser backends.
#if USE_SIMDJSON
template class SerializationJSON<SimdJSONParser>;
#endif
#if USE_RAPIDJSON
template class SerializationJSON<RapidJSONParser>;
#else
/// Fallback used when rapidjson is unavailable; DummyJSONParser cannot actually parse.
template class SerializationJSON<DummyJSONParser>;
#endif
}

View File

@ -0,0 +1,49 @@
#pragma once
#include <DataTypes/Serializations/SerializationObject.h>
#include <Formats/JSONExtractTree.h>
#include <Common/ObjectPool.h>
namespace DB
{
/// Class for text serialization/deserialization of the JSON data type.
/// Parser is the underlying JSON parser implementation (e.g. simdjson / rapidjson / dummy).
template <typename Parser>
class SerializationJSON : public SerializationObject
{
public:
    SerializationJSON(
        std::unordered_map<String, SerializationPtr> typed_paths_serializations_,
        const std::unordered_set<String> & paths_to_skip_,
        const std::vector<String> & path_regexps_to_skip_,
        std::unique_ptr<JSONExtractTreeNode<Parser>> json_extract_tree_);

    void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
    void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
    void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
    void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
    void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
    void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
    void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
    void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
    void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
    void serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent) const override;
    void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
    void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;

private:
    /// Serialize one row as a JSON object; `pretty` enables multiline indented output.
    void serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, bool pretty = false, size_t indent = 0) const;
    /// Parse `object` as a JSON document and insert it into the column via json_extract_tree.
    void deserializeTextImpl(IColumn & column, std::string_view object, const FormatSettings & settings) const;

    /// Tree used to insert parsed JSON values into the column.
    std::unique_ptr<JSONExtractTreeNode<Parser>> json_extract_tree;
    JSONExtractInsertSettings insert_settings;

    /// Pool of parser objects to make SerializationJSON thread safe.
    mutable SimpleObjectPool<Parser> parsers_pool;
};
}

View File

@ -268,9 +268,16 @@ void SerializationLowCardinality::serializeBinaryBulkStateSuffix(
void SerializationLowCardinality::deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state,
SubstreamsDeserializeStatesCache * /*cache*/) const
SubstreamsDeserializeStatesCache * cache) const
{
settings.path.push_back(Substream::DictionaryKeys);
if (auto cached_state = getFromSubstreamsDeserializeStatesCache(cache, settings.path))
{
state = std::move(cached_state);
return;
}
auto * stream = settings.getter(settings.path);
settings.path.pop_back();

File diff suppressed because it is too large Load Diff

View File

@ -1,34 +1,43 @@
#pragma once
#include <Columns/ColumnObject.h>
#include <DataTypes/Serializations/SimpleTextSerialization.h>
#include <Common/ObjectPool.h>
#include <DataTypes/DataTypeObject.h>
#include <list>
namespace DB
{
/** Serialization for data type Object.
* Supported only text serialization/deserialization.
* and binary bulk serialization/deserialization without position independent
* encoding, i.e. serialization/deserialization into Native format.
*/
template <typename Parser>
class SerializationObjectDynamicPath;
class SerializationSubObject;
/// Class for binary serialization/deserialization of an Object type (currently only JSON).
class SerializationObject : public ISerialization
{
public:
/** In Native format ColumnObject can be serialized
* in two formats: as Tuple or as String.
* The format is the following:
*
* <serialization_kind> 1 byte -- 0 if Tuple, 1 if String.
* [type_name] -- Only for tuple serialization.
* ... data of internal column ...
*
* ClickHouse client serializazes objects as tuples.
* String serialization exists for clients, which cannot
* do parsing by themselves and they can send raw data as
* string. It will be parsed on the server side.
*/
/// Serialization can change in future. Let's introduce serialization version.
struct ObjectSerializationVersion
{
enum Value
{
BASIC = 0,
};
Value value;
static void checkVersion(UInt64 version);
explicit ObjectSerializationVersion(UInt64 version);
};
SerializationObject(
std::unordered_map<String, SerializationPtr> typed_path_serializations_,
const std::unordered_set<String> & paths_to_skip_,
const std::vector<String> & path_regexps_to_skip_);
void enumerateStreams(
EnumerateStreamsSettings & settings,
const StreamCallback & callback,
const SubstreamData & data) const override;
void serializeBinaryBulkStatePrefix(
const IColumn & column,
@ -63,59 +72,55 @@ public:
void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
void deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
void serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent) const override;
void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
void serializeTextMarkdown(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
static void restoreColumnObject(ColumnObject & column_object, size_t prev_size);
private:
enum class BinarySerializationKind : UInt8
friend SerializationObjectDynamicPath;
friend SerializationSubObject;
/// State of an Object structure. Can be also used during deserializing of Object subcolumns.
struct DeserializeBinaryBulkStateObjectStructure : public ISerialization::DeserializeBinaryBulkState
{
TUPLE = 0,
STRING = 1,
ObjectSerializationVersion structure_version;
size_t max_dynamic_paths;
std::vector<String> sorted_dynamic_paths;
std::unordered_set<String> dynamic_paths;
/// Paths statistics. Map (dynamic path) -> (number of non-null values in this path).
ColumnObject::StatisticsPtr statistics;
explicit DeserializeBinaryBulkStateObjectStructure(UInt64 structure_version_) : structure_version(structure_version_) {}
};
struct SerializeStateObject;
struct DeserializeStateObject;
void deserializeBinaryBulkFromString(
ColumnObject & column_object,
size_t limit,
static DeserializeBinaryBulkStatePtr deserializeObjectStructureStatePrefix(
DeserializeBinaryBulkSettings & settings,
DeserializeStateObject & state,
SubstreamsCache * cache) const;
SubstreamsDeserializeStatesCache * cache);
void deserializeBinaryBulkFromTuple(
ColumnObject & column_object,
size_t limit,
DeserializeBinaryBulkSettings & settings,
DeserializeStateObject & state,
SubstreamsCache * cache) const;
/// Shared data has type Array(Tuple(String, String)).
static const DataTypePtr & getTypeOfSharedData();
template <typename TSettings>
void checkSerializationIsSupported(const TSettings & settings) const;
struct TypedPathSubcolumnCreator : public ISubcolumnCreator
{
String path;
template <typename Reader>
void deserializeTextImpl(IColumn & column, Reader && reader) const;
explicit TypedPathSubcolumnCreator(const String & path_) : path(path_) {}
void serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const;
DataTypePtr create(const DataTypePtr & prev) const override { return prev; }
ColumnPtr create(const ColumnPtr & prev) const override { return prev; }
SerializationPtr create(const SerializationPtr & prev) const override;
};
template <bool pretty_json = false>
void serializeTextFromSubcolumn(const ColumnObject::Subcolumn & subcolumn, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent = 0) const;
protected:
bool shouldSkipPath(const String & path) const;
/// Pool of parser objects to make SerializationObject thread safe.
mutable SimpleObjectPool<Parser> parsers_pool;
std::unordered_map<String, SerializationPtr> typed_path_serializations;
std::unordered_set<String> paths_to_skip;
std::vector<String> sorted_paths_to_skip;
std::list<re2::RE2> path_regexps_to_skip;
SerializationPtr dynamic_serialization;
private:
std::vector<String> sorted_typed_paths;
SerializationPtr shared_data_serialization;
};
SerializationPtr getObjectSerialization(const String & schema_format);
}

View File

@ -0,0 +1,586 @@
#include <DataTypes/Serializations/SerializationObjectDeprecated.h>
#include <DataTypes/Serializations/JSONDataParser.h>
#include <DataTypes/Serializations/SerializationString.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/ObjectUtils.h>
#include <DataTypes/DataTypeFactory.h>
#include <Common/JSONParsers/SimdJSONParser.h>
#include <Common/JSONParsers/RapidJSONParser.h>
#include <Columns/ColumnObjectDeprecated.h>
#include <Columns/ColumnTuple.h>
#include <Columns/ColumnString.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <IO/VarInt.h>
#include <magic_enum.hpp>
#include <memory>
#include <string>
namespace DB
{
/// Error codes thrown by SerializationObjectDeprecated below.
namespace ErrorCodes
{
    extern const int NOT_IMPLEMENTED;
    extern const int INCORRECT_DATA;
    extern const int CANNOT_READ_ALL_DATA;
    extern const int ARGUMENT_OUT_OF_BOUND;
    extern const int CANNOT_PARSE_TEXT;
    extern const int EXPERIMENTAL_FEATURE_ERROR;
}
/// Common text deserialization: `reader` extracts the raw text of one value
/// (applying the format-specific unescaping), then the text is parsed as an
/// object and its (path, value) pairs are inserted into the Object column.
template <typename Parser>
template <typename Reader>
void SerializationObjectDeprecated<Parser>::deserializeTextImpl(IColumn & column, Reader && reader) const
{
    auto & column_object = assert_cast<ColumnObjectDeprecated &>(column);

    String buf;
    reader(buf);

    std::optional<ParseResult> result;

    /// Treat empty string as an empty object
    /// for better CAST from String to Object.
    if (!buf.empty())
    {
        /// Parsers are pooled so one serialization instance can be used from
        /// multiple threads.
        auto parser = parsers_pool.get([] { return new Parser; });
        result = parser->parse(buf.data(), buf.size());
    }
    else
    {
        result = ParseResult{};
    }

    if (!result)
        throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse object");

    auto & [paths, values] = *result;
    assert(paths.size() == values.size());

    size_t old_column_size = column_object.size();
    for (size_t i = 0; i < paths.size(); ++i)
    {
        auto field_info = getFieldInfo(values[i]);

        /// Normalize array dimensions of the value when the parser marked it as needed.
        if (field_info.need_fold_dimension)
            values[i] = applyVisitor(FieldVisitorFoldDimension(field_info.num_dimensions), std::move(values[i]));

        /// Values whose scalar type is Nothing carry no type information -- skip them.
        if (isNothing(field_info.scalar_type))
            continue;

        if (!column_object.hasSubcolumn(paths[i]))
        {
            if (paths[i].hasNested())
                column_object.addNestedSubcolumn(paths[i], field_info, old_column_size);
            else
                column_object.addSubcolumn(paths[i], old_column_size);
        }

        /// Each subcolumn must have exactly one value appended per row.
        auto & subcolumn = column_object.getSubcolumn(paths[i]);
        assert(subcolumn.size() == old_column_size);
        subcolumn.insert(std::move(values[i]), std::move(field_info));
    }

    /// Insert default values to missed subcolumns.
    const auto & subcolumns = column_object.getSubcolumns();
    for (const auto & entry : subcolumns)
    {
        if (entry->data.size() == old_column_size)
        {
            bool inserted = column_object.tryInsertDefaultFromNested(entry);
            if (!inserted)
                entry->data.insertDefault();
        }
    }

    column_object.incrementNumRows();
}
/// The per-format deserializations below differ only in how the raw text of
/// the value is read from the buffer; the actual parsing and insertion is
/// done by deserializeTextImpl.
template <typename Parser>
void SerializationObjectDeprecated<Parser>::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
{
    deserializeTextImpl(column, [&](String & s) { readStringInto(s, istr); });
}

template <typename Parser>
void SerializationObjectDeprecated<Parser>::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
    /// CRLF handling is controlled by the TSV input setting.
    deserializeTextImpl(column, [&](String & s) { settings.tsv.crlf_end_of_line_input ? readEscapedStringCRLF(s, istr) : readEscapedString(s, istr); });
}

template <typename Parser>
void SerializationObjectDeprecated<Parser>::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
{
    deserializeTextImpl(column, [&](String & s) { readQuotedStringInto<true>(s, istr); });
}

template <typename Parser>
void SerializationObjectDeprecated<Parser>::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
{
    /// The parser itself knows how to consume one JSON value from the buffer.
    deserializeTextImpl(column, [&](String & s) { Parser::readJSON(s, istr); });
}

template <typename Parser>
void SerializationObjectDeprecated<Parser>::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
    deserializeTextImpl(column, [&](String & s) { readCSVStringInto(s, istr, settings.csv); });
}
/// The deprecated Object type supports bulk serialization only without
/// position-independent encoding (i.e. the Native format); reject anything else early.
template <typename Parser>
template <typename TSettings>
void SerializationObjectDeprecated<Parser>::checkSerializationIsSupported(const TSettings & settings) const
{
    if (settings.position_independent_encoding)
        throw Exception(ErrorCodes::NOT_IMPLEMENTED,
            "DataTypeObject doesn't support serialization with position independent encoding");
}
/// Serialization state: the Object column is written through its flattened
/// Tuple representation, whose type, serialization and nested state are kept here.
template <typename Parser>
struct SerializationObjectDeprecated<Parser>::SerializeStateObject : public ISerialization::SerializeBinaryBulkState
{
    DataTypePtr nested_type;
    SerializationPtr nested_serialization;
    SerializeBinaryBulkStatePtr nested_state;
};

/// Deserialization state: `kind` records whether the data was written as a
/// Tuple or as a raw String (see BinarySerializationKind in the header).
template <typename Parser>
struct SerializationObjectDeprecated<Parser>::DeserializeStateObject : public ISerialization::DeserializeBinaryBulkState
{
    BinarySerializationKind kind;
    DataTypePtr nested_type;
    SerializationPtr nested_serialization;
    DeserializeBinaryBulkStatePtr nested_state;
};
/// Writes the structure header (serialization kind + flattened tuple type name)
/// and prepares the nested tuple serialization state.
template <typename Parser>
void SerializationObjectDeprecated<Parser>::serializeBinaryBulkStatePrefix(
    const IColumn & column,
    SerializeBinaryBulkSettings & settings,
    SerializeBinaryBulkStatePtr & state) const
{
    checkSerializationIsSupported(settings);
    if (state)
        throw Exception(ErrorCodes::NOT_IMPLEMENTED,
            "DataTypeObject doesn't support serialization with non-trivial state");

    const auto & column_object = assert_cast<const ColumnObjectDeprecated &>(column);
    if (!column_object.isFinalized())
    {
        /// The column must be finalized before it can be flattened to a Tuple;
        /// retry on a finalized copy.
        auto finalized = column_object.cloneFinalized();
        serializeBinaryBulkStatePrefix(*finalized, settings, state);
        return;
    }

    settings.path.push_back(Substream::DeprecatedObjectStructure);
    auto * stream = settings.getter(settings.path);

    if (!stream)
        throw Exception(ErrorCodes::EXPERIMENTAL_FEATURE_ERROR, "Missing stream for kind of binary serialization");

    /// Header of the structure substream: the kind byte (always TUPLE when
    /// writing) followed by the name of the flattened tuple type.
    auto [tuple_column, tuple_type] = unflattenObjectToTuple(column_object);
    writeIntBinary(static_cast<UInt8>(BinarySerializationKind::TUPLE), *stream);
    writeStringBinary(tuple_type->getName(), *stream);

    auto state_object = std::make_shared<SerializeStateObject>();
    state_object->nested_type = tuple_type;
    state_object->nested_serialization = tuple_type->getDefaultSerialization();

    /// The data itself goes into a separate substream.
    settings.path.back() = Substream::DeprecatedObjectData;
    state_object->nested_serialization->serializeBinaryBulkStatePrefix(*tuple_column, settings, state_object->nested_state);

    state = std::move(state_object);
    settings.path.pop_back();
}
/// Finishes bulk serialization by delegating the suffix to the nested
/// (tuple) serialization under the data substream.
template <typename Parser>
void SerializationObjectDeprecated<Parser>::serializeBinaryBulkStateSuffix(
    SerializeBinaryBulkSettings & settings,
    SerializeBinaryBulkStatePtr & state) const
{
    checkSerializationIsSupported(settings);
    auto * state_object = checkAndGetState<SerializeStateObject>(state);

    settings.path.push_back(Substream::DeprecatedObjectData);
    state_object->nested_serialization->serializeBinaryBulkStateSuffix(settings, state_object->nested_state);
    settings.path.pop_back();
}
/// Reads the structure header written by serializeBinaryBulkStatePrefix and
/// prepares the nested serialization (tuple or string) for reading the data.
template <typename Parser>
void SerializationObjectDeprecated<Parser>::deserializeBinaryBulkStatePrefix(
    DeserializeBinaryBulkSettings & settings,
    DeserializeBinaryBulkStatePtr & state,
    SubstreamsDeserializeStatesCache * cache) const
{
    checkSerializationIsSupported(settings);
    if (state)
        throw Exception(ErrorCodes::NOT_IMPLEMENTED,
            "DataTypeObject doesn't support serialization with non-trivial state");

    settings.path.push_back(Substream::DeprecatedObjectStructure);
    auto * stream = settings.getter(settings.path);
    settings.path.pop_back();

    if (!stream)
        throw Exception(ErrorCodes::CANNOT_READ_ALL_DATA,
            "Cannot read kind of binary serialization of DataTypeObject, because its stream is missing");

    /// First byte of the structure substream encodes how the data was written
    /// (as a Tuple or as a raw String).
    UInt8 kind_raw;
    readIntBinary(kind_raw, *stream);
    auto kind = magic_enum::enum_cast<BinarySerializationKind>(kind_raw);
    if (!kind)
        throw Exception(ErrorCodes::INCORRECT_DATA,
            "Unknown binary serialization kind of Object: {}", std::to_string(kind_raw));

    auto state_object = std::make_shared<DeserializeStateObject>();
    state_object->kind = *kind;

    if (state_object->kind == BinarySerializationKind::TUPLE)
    {
        /// For tuple serialization the tuple type name follows the kind byte.
        String data_type_name;
        readStringBinary(data_type_name, *stream);
        state_object->nested_type = DataTypeFactory::instance().get(data_type_name);
        state_object->nested_serialization = state_object->nested_type->getDefaultSerialization();

        if (!isTuple(state_object->nested_type))
            throw Exception(ErrorCodes::INCORRECT_DATA,
                "Data of type Object should be written as Tuple, got: {}", data_type_name);
    }
    else if (state_object->kind == BinarySerializationKind::STRING)
    {
        state_object->nested_type = std::make_shared<DataTypeString>();
        state_object->nested_serialization = std::make_shared<SerializationString>();
    }
    else
    {
        throw Exception(ErrorCodes::INCORRECT_DATA,
            "Unknown binary serialization kind of Object: {}", std::to_string(kind_raw));
    }

    settings.path.push_back(Substream::DeprecatedObjectData);
    state_object->nested_serialization->deserializeBinaryBulkStatePrefix(settings, state_object->nested_state, cache);
    settings.path.pop_back();

    state = std::move(state_object);
}
template <typename Parser>
void SerializationObjectDeprecated<Parser>::serializeBinaryBulkWithMultipleStreams(
    const IColumn & column,
    size_t offset,
    size_t limit,
    SerializeBinaryBulkSettings & settings,
    SerializeBinaryBulkStatePtr & state) const
{
    checkSerializationIsSupported(settings);
    const auto & column_object = assert_cast<const ColumnObjectDeprecated &>(column);
    auto * state_object = checkAndGetState<SerializeStateObject>(state);

    /// Flattening to a Tuple requires a finalized column; redo the call on a
    /// finalized copy if necessary.
    if (!column_object.isFinalized())
    {
        auto finalized = column_object.cloneFinalized();
        serializeBinaryBulkWithMultipleStreams(*finalized, offset, limit, settings, state);
        return;
    }

    auto [tuple_column, tuple_type] = unflattenObjectToTuple(column_object);

    /// The tuple type was fixed in the prefix; a mismatch means the column
    /// structure changed between the prefix and the data serialization.
    if (!state_object->nested_type->equals(*tuple_type))
        throw Exception(ErrorCodes::EXPERIMENTAL_FEATURE_ERROR,
            "Types of internal column of Object mismatched. Expected: {}, Got: {}",
            state_object->nested_type->getName(), tuple_type->getName());

    settings.path.push_back(Substream::DeprecatedObjectData);
    /// Write data only when the corresponding substream exists.
    if (settings.getter(settings.path) != nullptr)
        state_object->nested_serialization->serializeBinaryBulkWithMultipleStreams(
            *tuple_column, offset, limit, settings, state_object->nested_state);
    settings.path.pop_back();
}
/// Reads up to `limit` rows into an (initially empty) Object column, dispatching
/// on the serialization kind recorded in the prefix state.
template <typename Parser>
void SerializationObjectDeprecated<Parser>::deserializeBinaryBulkWithMultipleStreams(
    ColumnPtr & column,
    size_t limit,
    DeserializeBinaryBulkSettings & settings,
    DeserializeBinaryBulkStatePtr & state,
    SubstreamsCache * cache) const
{
    checkSerializationIsSupported(settings);
    if (!column->empty())
        throw Exception(ErrorCodes::NOT_IMPLEMENTED,
            "DataTypeObject cannot be deserialized to non-empty column");

    auto mutable_column = column->assumeMutable();
    auto & column_object = assert_cast<ColumnObjectDeprecated &>(*mutable_column);
    auto * state_object = checkAndGetState<DeserializeStateObject>(state);

    settings.path.push_back(Substream::DeprecatedObjectData);
    if (state_object->kind == BinarySerializationKind::STRING)
        deserializeBinaryBulkFromString(column_object, limit, settings, *state_object, cache);
    else
        deserializeBinaryBulkFromTuple(column_object, limit, settings, *state_object, cache);
    settings.path.pop_back();

    /// Subcolumns were filled piecewise; validate and finalize before exposing the column.
    column_object.checkConsistency();
    column_object.finalize();
    column = std::move(mutable_column);
}
/// String-kind data: read the raw strings in bulk, then parse each row's
/// string as one object via the text deserialization path.
template <typename Parser>
void SerializationObjectDeprecated<Parser>::deserializeBinaryBulkFromString(
    ColumnObjectDeprecated & column_object,
    size_t limit,
    DeserializeBinaryBulkSettings & settings,
    DeserializeStateObject & state,
    SubstreamsCache * cache) const
{
    ColumnPtr column_string = state.nested_type->createColumn();
    state.nested_serialization->deserializeBinaryBulkWithMultipleStreams(
        column_string, limit, settings, state.nested_state, cache);

    size_t input_rows_count = column_string->size();
    column_object.reserve(input_rows_count);

    /// Default-constructed settings are used for the row-by-row text parsing.
    FormatSettings format_settings;
    for (size_t i = 0; i < input_rows_count; ++i)
    {
        const auto & val = column_string->getDataAt(i);
        ReadBufferFromMemory read_buffer(val.data, val.size);
        deserializeWholeText(column_object, read_buffer, format_settings);

        /// The whole string must be consumed, otherwise it was not exactly one object.
        if (!read_buffer.eof())
            throw Exception(ErrorCodes::CANNOT_PARSE_TEXT,
                "Cannot parse string to column Object. Expected eof");
    }
}
/// Tuple-kind data: read the nested tuple column in bulk, then flatten it
/// back into per-path subcolumns of the Object column.
template <typename Parser>
void SerializationObjectDeprecated<Parser>::deserializeBinaryBulkFromTuple(
    ColumnObjectDeprecated & column_object,
    size_t limit,
    DeserializeBinaryBulkSettings & settings,
    DeserializeStateObject & state,
    SubstreamsCache * cache) const
{
    ColumnPtr column_tuple = state.nested_type->createColumn();
    state.nested_serialization->deserializeBinaryBulkWithMultipleStreams(
        column_tuple, limit, settings, state.nested_state, cache);

    auto [tuple_paths, tuple_types] = flattenTuple(state.nested_type);
    auto flattened_tuple = flattenTuple(column_tuple);
    const auto & tuple_columns = assert_cast<const ColumnTuple &>(*flattened_tuple).getColumns();

    assert(tuple_paths.size() == tuple_types.size());
    size_t num_subcolumns = tuple_paths.size();
    if (tuple_columns.size() != num_subcolumns)
        throw Exception(ErrorCodes::INCORRECT_DATA,
            "Inconsistent type ({}) and column ({}) while reading column of type Object",
            state.nested_type->getName(), column_tuple->getName());

    for (size_t i = 0; i < num_subcolumns; ++i)
        column_object.addSubcolumn(tuple_paths[i], tuple_columns[i]->assumeMutable());
}
/// Row-wise binary (de)serialization is not supported for the deprecated
/// Object type; it can only be read/written in bulk (see the methods above).
template <typename Parser>
void SerializationObjectDeprecated<Parser>::serializeBinary(const Field &, WriteBuffer &, const FormatSettings &) const
{
    throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not implemented for SerializationObjectDeprecated");
}

template <typename Parser>
void SerializationObjectDeprecated<Parser>::deserializeBinary(Field &, ReadBuffer &, const FormatSettings &) const
{
    throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not implemented for SerializationObjectDeprecated");
}

template <typename Parser>
void SerializationObjectDeprecated<Parser>::serializeBinary(const IColumn &, size_t, WriteBuffer &, const FormatSettings &) const
{
    throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not implemented for SerializationObjectDeprecated");
}

template <typename Parser>
void SerializationObjectDeprecated<Parser>::deserializeBinary(IColumn &, ReadBuffer &, const FormatSettings &) const
{
    throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not implemented for SerializationObjectDeprecated");
}
/// TODO: use format different of JSON in serializations.

/// Serializes one row of the Object column as a compact JSON object:
/// {"path":value,...}, one entry per subcolumn.
template <typename Parser>
void SerializationObjectDeprecated<Parser>::serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
    const auto & column_object = assert_cast<const ColumnObjectDeprecated &>(column);

    writeChar('{', ostr);
    bool is_first = true;
    for (const auto & entry : column_object.getSubcolumns())
    {
        if (!is_first)
            writeCString(",", ostr);
        is_first = false;

        writeDoubleQuoted(entry->path.getPath(), ostr);
        writeChar(':', ostr);
        serializeTextFromSubcolumn(entry->data, row_num, ostr, settings);
    }
    writeChar('}', ostr);
}
/// Serializes one value of a subcolumn as JSON text (pretty when pretty_json).
/// Handles three storage states of the subcolumn: finalized (single column),
/// a default-valued prefix, and a sequence of typed parts.
template <typename Parser>
template <bool pretty_json>
void SerializationObjectDeprecated<Parser>::serializeTextFromSubcolumn(
    const ColumnObjectDeprecated::Subcolumn & subcolumn, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent) const
{
    const auto & least_common_type = subcolumn.getLeastCommonType();

    /// Fast path: a finalized subcolumn is a single column of the least common type.
    if (subcolumn.isFinalized())
    {
        const auto & finalized_column = subcolumn.getFinalizedColumn();
        auto info = least_common_type->getSerializationInfo(finalized_column);
        auto serialization = least_common_type->getSerialization(*info);
        if constexpr (pretty_json)
            serialization->serializeTextJSONPretty(finalized_column, row_num, ostr, settings, indent);
        else
            serialization->serializeTextJSON(finalized_column, row_num, ostr, settings);
        return;
    }

    size_t ind = row_num;
    if (ind < subcolumn.getNumberOfDefaultsInPrefix())
    {
        /// Suboptimal, but it should happen rarely.
        /// Materialize a single default value just to serialize it.
        auto tmp_column = subcolumn.getLeastCommonType()->createColumn();
        tmp_column->insertDefault();
        auto info = least_common_type->getSerializationInfo(*tmp_column);
        auto serialization = least_common_type->getSerialization(*info);
        if constexpr (pretty_json)
            serialization->serializeTextJSONPretty(*tmp_column, 0, ostr, settings, indent);
        else
            serialization->serializeTextJSON(*tmp_column, 0, ostr, settings);
        return;
    }

    /// Otherwise locate the typed part containing the requested row and
    /// serialize it with that part's own type.
    ind -= subcolumn.getNumberOfDefaultsInPrefix();
    for (const auto & part : subcolumn.getData())
    {
        if (ind < part->size())
        {
            auto part_type = getDataTypeByColumn(*part);
            auto info = part_type->getSerializationInfo(*part);
            auto serialization = part_type->getSerialization(*info);
            if constexpr (pretty_json)
                serialization->serializeTextJSONPretty(*part, ind, ostr, settings, indent);
            else
                serialization->serializeTextJSON(*part, ind, ostr, settings);
            return;
        }

        ind -= part->size();
    }

    throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Index ({}) for text serialization is out of range", row_num);
}
/// Plain text form of Object is its JSON representation.
template <typename Parser>
void SerializationObjectDeprecated<Parser>::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
    serializeTextImpl(column, row_num, ostr, settings);
}
/// The escaped/quoted/CSV forms serialize to JSON text in a temporary buffer
/// first and then apply the corresponding output escaping to the whole string.
template <typename Parser>
void SerializationObjectDeprecated<Parser>::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
    WriteBufferFromOwnString ostr_str;
    serializeTextImpl(column, row_num, ostr_str, settings);
    writeEscapedString(ostr_str.str(), ostr);
}

template <typename Parser>
void SerializationObjectDeprecated<Parser>::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
    WriteBufferFromOwnString ostr_str;
    serializeTextImpl(column, row_num, ostr_str, settings);
    writeQuotedString(ostr_str.str(), ostr);
}

/// JSON output needs no extra escaping -- the value already is JSON text.
template <typename Parser>
void SerializationObjectDeprecated<Parser>::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
    serializeTextImpl(column, row_num, ostr, settings);
}

template <typename Parser>
void SerializationObjectDeprecated<Parser>::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
    WriteBufferFromOwnString ostr_str;
    serializeTextImpl(column, row_num, ostr_str, settings);
    writeCSVString(ostr_str.str(), ostr);
}
template <typename Parser>
void SerializationObjectDeprecated<Parser>::serializeTextMarkdown(
    const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
    /// Without special escaping, Markdown output is identical to the TSV-escaped form.
    if (!settings.markdown.escape_special_characters)
    {
        serializeTextEscaped(column, row_num, ostr, settings);
        return;
    }

    /// Otherwise render JSON text first, then escape Markdown special characters.
    WriteBufferFromOwnString json_text;
    serializeTextImpl(column, row_num, json_text, settings);
    writeMarkdownEscapedString(json_text.str(), ostr);
}
/// Pretty-printed JSON: one "path": value entry per line, indented with
/// four spaces per level.
template <typename Parser>
void SerializationObjectDeprecated<Parser>::serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent) const
{
    const auto & column_object = assert_cast<const ColumnObjectDeprecated &>(column);

    writeCString("{\n", ostr);
    bool is_first = true;
    for (const auto & entry : column_object.getSubcolumns())
    {
        if (!is_first)
            writeCString(",\n", ostr);
        is_first = false;

        writeChar(' ', (indent + 1) * 4, ostr);
        writeDoubleQuoted(entry->path.getPath(), ostr);
        writeCString(": ", ostr);
        serializeTextFromSubcolumn<true>(entry->data, row_num, ostr, settings, indent + 1);
    }
    writeChar('\n', ostr);
    writeChar(' ', indent * 4, ostr);
    writeChar('}', ostr);
}
/// Factory for the deprecated Object serialization. Only the "json" schema
/// format is supported, backed by SimdJSON or RapidJSON depending on the build.
SerializationPtr getObjectSerialization(const String & schema_format)
{
    if (schema_format == "json")
    {
#if USE_SIMDJSON
        return std::make_shared<SerializationObjectDeprecated<JSONDataParser<SimdJSONParser>>>();
#elif USE_RAPIDJSON
        return std::make_shared<SerializationObjectDeprecated<JSONDataParser<RapidJSONParser>>>();
#else
        throw Exception(ErrorCodes::NOT_IMPLEMENTED,
            "To use data type Object with JSON format ClickHouse should be built with Simdjson or Rapidjson");
#endif
    }

    throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unknown schema format '{}'", schema_format);
}

}

View File

@ -0,0 +1,121 @@
#pragma once
#include <Columns/ColumnObjectDeprecated.h>
#include <DataTypes/Serializations/SimpleTextSerialization.h>
#include <Common/ObjectPool.h>
namespace DB
{
/** Serialization for data type Object (deprecated).
* Supported only text serialization/deserialization.
* and binary bulk serialization/deserialization without position independent
* encoding, i.e. serialization/deserialization into Native format.
*/
template <typename Parser>
class SerializationObjectDeprecated : public ISerialization
{
public:
/** In Native format ColumnObjectDeprecated can be serialized
* in two formats: as Tuple or as String.
* The format is the following:
*
* <serialization_kind> 1 byte -- 0 if Tuple, 1 if String.
* [type_name] -- Only for tuple serialization.
* ... data of internal column ...
*
* ClickHouse client serializazes objects as tuples.
* String serialization exists for clients, which cannot
* do parsing by themselves and they can send raw data as
* string. It will be parsed on the server side.
*/
/// Bulk binary (de)serialization interface, used for columns stored in data parts.
void serializeBinaryBulkStatePrefix(
const IColumn & column,
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void serializeBinaryBulkStateSuffix(
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state,
SubstreamsDeserializeStatesCache * cache) const override;
void serializeBinaryBulkWithMultipleStreams(
const IColumn & column,
size_t offset,
size_t limit,
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void deserializeBinaryBulkWithMultipleStreams(
ColumnPtr & column,
size_t limit,
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state,
SubstreamsCache * cache) const override;
/// Single-value binary (de)serialization (RowBinary and similar formats).
void serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings &) const override;
void deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings &) const override;
void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
void deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
/// Text (de)serialization for the various text formats (escaped, quoted, JSON, CSV, Markdown).
void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
void serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent) const override;
void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
void serializeTextMarkdown(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
private:
/// Discriminator stored in front of binary data: objects may arrive either as
/// tuples (serialized by the server/client) or as raw strings that are parsed
/// on the server side (see the class comment above).
enum class BinarySerializationKind : UInt8
{
TUPLE = 0,
STRING = 1,
};
/// Opaque per-(de)serialization states; definitions live in the .cpp file.
struct SerializeStateObject;
struct DeserializeStateObject;
/// Reads bulk data written in the STRING representation.
void deserializeBinaryBulkFromString(
ColumnObjectDeprecated & column_object,
size_t limit,
DeserializeBinaryBulkSettings & settings,
DeserializeStateObject & state,
SubstreamsCache * cache) const;
/// Reads bulk data written in the TUPLE representation.
void deserializeBinaryBulkFromTuple(
ColumnObjectDeprecated & column_object,
size_t limit,
DeserializeBinaryBulkSettings & settings,
DeserializeStateObject & state,
SubstreamsCache * cache) const;
template <typename TSettings>
void checkSerializationIsSupported(const TSettings & settings) const;
template <typename Reader>
void deserializeTextImpl(IColumn & column, Reader && reader) const;
void serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const;
template <bool pretty_json = false>
void serializeTextFromSubcolumn(const ColumnObjectDeprecated::Subcolumn & subcolumn, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent = 0) const;
/// Pool of parser objects to make SerializationObjectDeprecated thread safe.
mutable SimpleObjectPool<Parser> parsers_pool;
};
SerializationPtr getObjectSerialization(const String & schema_format);
}

View File

@ -0,0 +1,192 @@
#include <Columns/ColumnDynamic.h>
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/DataTypeVariant.h>
#include <DataTypes/Serializations/SerializationObject.h>
#include <DataTypes/Serializations/SerializationObjectDynamicPath.h>
namespace DB
{
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
}
/// nested_ - serialization used when the path is stored as a separate dynamic path stream,
/// path_ - the requested dynamic path,
/// path_subcolumn_ - subcolumn of the Dynamic value at this path to extract (empty = whole value),
/// max_dynamic_types_ - Dynamic type parameter used when a temporary Dynamic column
/// has to be created while reading the path from shared data.
SerializationObjectDynamicPath::SerializationObjectDynamicPath(
const DB::SerializationPtr & nested_, const String & path_, const String & path_subcolumn_, size_t max_dynamic_types_)
: SerializationWrapper(nested_)
, path(path_)
, path_subcolumn(path_subcolumn_)
, dynamic_serialization(std::make_shared<SerializationDynamic>())
, shared_data_serialization(SerializationObject::getTypeOfSharedData()->getDefaultSerialization())
, max_dynamic_types(max_dynamic_types_)
{
}
/// Deserialization state of a single dynamic path subcolumn.
struct DeserializeBinaryBulkStateObjectDynamicPath : public ISerialization::DeserializeBinaryBulkState
{
/// State of the Object structure prefix (carries the set of dynamic paths).
ISerialization::DeserializeBinaryBulkStatePtr structure_state;
/// State of the stream we actually read from: either the dynamic path stream
/// or the shared data stream, depending on read_from_shared_data.
ISerialization::DeserializeBinaryBulkStatePtr nested_state;
/// True when the requested path is absent from dynamic paths and must be
/// extracted from shared data instead.
bool read_from_shared_data;
/// Accumulated shared data rows (used only when read_from_shared_data is true).
ColumnPtr shared_data;
};
/// Reports the substreams needed to read this dynamic path: always the Object
/// structure stream, plus either the dedicated dynamic path stream or the
/// shared data stream (which of the two is known only from deserialize_state).
void SerializationObjectDynamicPath::enumerateStreams(
DB::ISerialization::EnumerateStreamsSettings & settings,
const DB::ISerialization::StreamCallback & callback,
const DB::ISerialization::SubstreamData & data) const
{
settings.path.push_back(Substream::ObjectStructure);
callback(settings.path);
settings.path.pop_back();
const auto * deserialize_state = data.deserialize_state ? checkAndGetState<DeserializeBinaryBulkStateObjectDynamicPath>(data.deserialize_state) : nullptr;
/// We cannot enumerate anything if we don't have deserialization state, as we don't know the dynamic structure.
if (!deserialize_state)
return;
settings.path.push_back(Substream::ObjectData);
const auto * structure_state = checkAndGetState<SerializationObject::DeserializeBinaryBulkStateObjectStructure>(deserialize_state->structure_state);
/// Check if we have our path in dynamic paths.
if (structure_state->dynamic_paths.contains(path))
{
settings.path.push_back(Substream::ObjectDynamicPath);
settings.path.back().object_path_name = path;
auto path_data = SubstreamData(nested_serialization)
.withType(data.type)
.withColumn(data.column)
.withSerializationInfo(data.serialization_info)
.withDeserializeState(deserialize_state->nested_state);
settings.path.back().data = path_data;
nested_serialization->enumerateStreams(settings, callback, path_data);
settings.path.pop_back();
}
/// Otherwise we will have to read all shared data and try to find our path there.
else
{
settings.path.push_back(Substream::ObjectSharedData);
auto shared_data_substream_data = SubstreamData(shared_data_serialization)
.withType(data.type ? SerializationObject::getTypeOfSharedData() : nullptr)
.withColumn(data.column ? SerializationObject::getTypeOfSharedData()->createColumn() : nullptr)
.withSerializationInfo(data.serialization_info)
.withDeserializeState(deserialize_state->nested_state);
settings.path.back().data = shared_data_substream_data;
shared_data_serialization->enumerateStreams(settings, callback, shared_data_substream_data);
settings.path.pop_back();
}
settings.path.pop_back();
}
/// Dynamic path subcolumns are read-only: data is written only through the whole
/// Object column, so all bulk serialize* methods are intentionally disabled.
void SerializationObjectDynamicPath::serializeBinaryBulkStatePrefix(const IColumn &, SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const
{
    throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinaryBulkStatePrefix is not implemented for SerializationObjectDynamicPath");
}
/// Writing through a dynamic path subcolumn is not supported; see the class comment.
void SerializationObjectDynamicPath::serializeBinaryBulkStateSuffix(SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const
{
    throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinaryBulkStateSuffix is not implemented for SerializationObjectDynamicPath");
}
/// Reads the Object structure prefix and decides, once per deserialization,
/// whether this path will be read from its own dynamic path stream or from
/// shared data; then initializes the prefix state of the chosen stream.
void SerializationObjectDynamicPath::deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings, DeserializeBinaryBulkStatePtr & state, SubstreamsDeserializeStatesCache * cache) const
{
auto structure_state = SerializationObject::deserializeObjectStructureStatePrefix(settings, cache);
/// No structure state means there is nothing to read; leave `state` untouched.
if (!structure_state)
return;
auto dynamic_path_state = std::make_shared<DeserializeBinaryBulkStateObjectDynamicPath>();
dynamic_path_state->structure_state = std::move(structure_state);
/// Remember if we need to read from shared data or we have this path in dynamic paths.
dynamic_path_state->read_from_shared_data = !checkAndGetState<SerializationObject::DeserializeBinaryBulkStateObjectStructure>(dynamic_path_state->structure_state)->dynamic_paths.contains(path);
settings.path.push_back(Substream::ObjectData);
if (dynamic_path_state->read_from_shared_data)
{
settings.path.push_back(Substream::ObjectSharedData);
shared_data_serialization->deserializeBinaryBulkStatePrefix(settings, dynamic_path_state->nested_state, cache);
settings.path.pop_back();
}
else
{
settings.path.push_back(Substream::ObjectDynamicPath);
settings.path.back().object_path_name = path;
nested_serialization->deserializeBinaryBulkStatePrefix(settings, dynamic_path_state->nested_state, cache);
settings.path.pop_back();
}
settings.path.pop_back();
state = std::move(dynamic_path_state);
}
/// Writing through a dynamic path subcolumn is not supported; see the class comment.
void SerializationObjectDynamicPath::serializeBinaryBulkWithMultipleStreams(const IColumn &, size_t, size_t, SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const
{
    throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinaryBulkWithMultipleStreams is not implemented for SerializationObjectDynamicPath");
}
/// Appends up to `limit` values of the requested path (or its subcolumn) to
/// `result_column`, reading either from the dedicated dynamic path stream or
/// by scanning shared data, as decided in deserializeBinaryBulkStatePrefix.
void SerializationObjectDynamicPath::deserializeBinaryBulkWithMultipleStreams(
ColumnPtr & result_column,
size_t limit,
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state,
SubstreamsCache * cache) const
{
if (!state)
return;
auto * dynamic_path_state = checkAndGetState<DeserializeBinaryBulkStateObjectDynamicPath>(state);
settings.path.push_back(Substream::ObjectData);
/// Check if we don't need to read shared data. In this case just read data from dynamic path.
if (!dynamic_path_state->read_from_shared_data)
{
settings.path.push_back(Substream::ObjectDynamicPath);
settings.path.back().object_path_name = path;
nested_serialization->deserializeBinaryBulkWithMultipleStreams(result_column, limit, settings, dynamic_path_state->nested_state, cache);
settings.path.pop_back();
}
/// Otherwise, read the whole shared data column and extract requested path from it.
/// TODO: We can read several subcolumns of the same path located in the shared data
/// and right now we extract the whole path column from shared data every time
/// and then extract the requested subcolumns. We can optimize it and use substreams
/// cache here to avoid extracting the same path from shared data several times.
///
/// TODO: We can change the serialization of shared data to optimize reading paths from it.
/// Right now we cannot know if shared data contains our path in current range or not,
/// but we can change the serialization and write the list of all paths stored in shared
/// data before each granule, and then replace the column that stores paths with column
/// with indexes in this list. It can also reduce the storage, because we will store
/// each path only once and can replace UInt64 string offset column with indexes column
/// that can have smaller type depending on the number of paths in the list.
else
{
settings.path.push_back(Substream::ObjectSharedData);
/// Initialize shared_data column if needed.
/// NOTE(review): the reset is keyed on result_column being empty; this assumes
/// shared_data and result_column always grow in lockstep - confirm.
if (result_column->empty())
dynamic_path_state->shared_data = SerializationObject::getTypeOfSharedData()->createColumn();
/// prev_size indexes into shared_data offsets below - relies on the lockstep
/// growth mentioned above.
size_t prev_size = result_column->size();
shared_data_serialization->deserializeBinaryBulkWithMultipleStreams(dynamic_path_state->shared_data, limit, settings, dynamic_path_state->nested_state, cache);
/// If we need to read a subcolumn from Dynamic column, create an empty Dynamic column, fill it and extract subcolumn.
MutableColumnPtr dynamic_column = path_subcolumn.empty() ? result_column->assumeMutable() : ColumnDynamic::create(max_dynamic_types)->getPtr();
/// Check if we don't have any paths in shared data in current range.
const auto & offsets = assert_cast<const ColumnArray &>(*dynamic_path_state->shared_data).getOffsets();
if (offsets.back() == offsets[ssize_t(prev_size) - 1])
dynamic_column->insertManyDefaults(limit);
else
ColumnObject::fillPathColumnFromSharedData(*dynamic_column, path, dynamic_path_state->shared_data, prev_size, dynamic_path_state->shared_data->size());
/// Extract subcolumn from Dynamic column if needed.
if (!path_subcolumn.empty())
{
auto subcolumn = std::make_shared<DataTypeDynamic>(max_dynamic_types)->getSubcolumn(path_subcolumn, dynamic_column->getPtr());
result_column->assumeMutable()->insertRangeFrom(*subcolumn, 0, subcolumn->size());
}
settings.path.pop_back();
}
settings.path.pop_back();
}
}

View File

@ -0,0 +1,58 @@
#pragma once
#include <DataTypes/Serializations/SerializationWrapper.h>
namespace DB
{
/// Serialization of dynamic Object paths.
/// For example, if we have type JSON(a.b UInt32, b.c String) and data {"a" : {"b" : 42}, "b" : {"c" : "Hello"}, "c" : {"d" : [1, 2, 3]}, "d" : 42}
/// this class will be responsible for reading dynamic paths 'c.d' and 'd' as subcolumns.
/// Typed paths 'a.b' and 'b.c' are serialized in SerializationObjectTypedPath.
/// Reading only: the serialize* bulk methods throw NOT_IMPLEMENTED (see the .cpp file).
class SerializationObjectDynamicPath final : public SerializationWrapper
{
public:
SerializationObjectDynamicPath(const SerializationPtr & nested_, const String & path_, const String & path_subcolumn_, size_t max_dynamic_types_);
void enumerateStreams(
EnumerateStreamsSettings & settings,
const StreamCallback & callback,
const SubstreamData & data) const override;
void serializeBinaryBulkStatePrefix(
const IColumn & column,
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void serializeBinaryBulkStateSuffix(
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state,
SubstreamsDeserializeStatesCache * cache) const override;
void serializeBinaryBulkWithMultipleStreams(
const IColumn & column,
size_t offset,
size_t limit,
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void deserializeBinaryBulkWithMultipleStreams(
ColumnPtr & column,
size_t limit,
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state,
SubstreamsCache * cache) const override;
private:
/// The requested dynamic path.
String path;
/// Subcolumn of the Dynamic value at this path; empty means the whole value.
String path_subcolumn;
/// Serialization of the Dynamic type (created in the constructor).
SerializationPtr dynamic_serialization;
/// Default serialization of the shared data substructure of Object.
SerializationPtr shared_data_serialization;
/// Dynamic type parameter used for temporary Dynamic columns built while
/// extracting the path from shared data.
size_t max_dynamic_types;
};
}

View File

@ -0,0 +1,78 @@
#include <Columns/ColumnDynamic.h>
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/DataTypeVariant.h>
#include <DataTypes/Serializations/SerializationObject.h>
#include <DataTypes/Serializations/SerializationObjectTypedPath.h>
#include <IO/ReadHelpers.h>
namespace DB
{
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
}
/// A typed path has a single substream located under ObjectData/ObjectTypedPath(<path>);
/// delegate the enumeration of its contents to the nested serialization.
void SerializationObjectTypedPath::enumerateStreams(
    DB::ISerialization::EnumerateStreamsSettings & settings,
    const DB::ISerialization::StreamCallback & callback,
    const DB::ISerialization::SubstreamData & data) const
{
    settings.path.push_back(Substream::ObjectData);
    settings.path.push_back(Substream::ObjectTypedPath);
    settings.path.back().object_path_name = path;

    /// Forward the caller-provided type/column/state unchanged to the nested serialization.
    auto typed_path_data = SubstreamData(nested_serialization)
                               .withType(data.type)
                               .withColumn(data.column)
                               .withSerializationInfo(data.serialization_info)
                               .withDeserializeState(data.deserialize_state);
    nested_serialization->enumerateStreams(settings, callback, typed_path_data);

    settings.path.pop_back();
    settings.path.pop_back();
}
/// Typed path subcolumns are read-only: data is written only through the whole
/// Object column, so all bulk serialize* methods are intentionally disabled.
void SerializationObjectTypedPath::serializeBinaryBulkStatePrefix(const IColumn &, SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const
{
    throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinaryBulkStatePrefix is not implemented for SerializationObjectTypedPath");
}
/// Writing through a typed path subcolumn is not supported; see the class comment.
void SerializationObjectTypedPath::serializeBinaryBulkStateSuffix(SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const
{
    throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinaryBulkStateSuffix is not implemented for SerializationObjectTypedPath");
}
/// Positions the substream path on this typed path's stream and lets the nested
/// serialization initialize its own deserialization state.
void SerializationObjectTypedPath::deserializeBinaryBulkStatePrefix(
    DeserializeBinaryBulkSettings & settings, DeserializeBinaryBulkStatePtr & state, SubstreamsDeserializeStatesCache * cache) const
{
    /// Temporarily extend the path with ObjectData/ObjectTypedPath(<path>) and
    /// restore it on the way out so the caller's path is left unchanged.
    settings.path.push_back(Substream::ObjectData);
    settings.path.push_back(Substream::ObjectTypedPath);
    settings.path.back().object_path_name = path;

    nested_serialization->deserializeBinaryBulkStatePrefix(settings, state, cache);

    settings.path.pop_back();
    settings.path.pop_back();
}
/// Writing through a typed path subcolumn is not supported; see the class comment.
void SerializationObjectTypedPath::serializeBinaryBulkWithMultipleStreams(const IColumn &, size_t, size_t, SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const
{
    throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinaryBulkWithMultipleStreams is not implemented for SerializationObjectTypedPath");
}
/// Reads up to `limit` values of this typed path into `result_column` by
/// delegating to the nested serialization under the typed path substream.
void SerializationObjectTypedPath::deserializeBinaryBulkWithMultipleStreams(
    ColumnPtr & result_column,
    size_t limit,
    DeserializeBinaryBulkSettings & settings,
    DeserializeBinaryBulkStatePtr & state,
    SubstreamsCache * cache) const
{
    /// Temporarily extend the path with ObjectData/ObjectTypedPath(<path>) and
    /// restore it afterwards.
    settings.path.push_back(Substream::ObjectData);
    settings.path.push_back(Substream::ObjectTypedPath);
    settings.path.back().object_path_name = path;

    nested_serialization->deserializeBinaryBulkWithMultipleStreams(result_column, limit, settings, state, cache);

    settings.path.pop_back();
    settings.path.pop_back();
}
}

View File

@ -0,0 +1,57 @@
#pragma once
#include <DataTypes/Serializations/SerializationWrapper.h>
namespace DB
{
/// Serialization of typed Object paths.
/// For example, for type JSON(a.b UInt32, b.c String) this serialization
/// will be used to read paths 'a.b' and 'b.c' as subcolumns.
/// Reading only: the serialize* bulk methods throw NOT_IMPLEMENTED (see the .cpp file).
class SerializationObjectTypedPath final : public SerializationWrapper
{
public:
/// nested_ - serialization of the path's declared type, path_ - the typed path name.
SerializationObjectTypedPath(const SerializationPtr & nested_, const String & path_)
: SerializationWrapper(nested_)
, path(path_)
{
}
void enumerateStreams(
EnumerateStreamsSettings & settings,
const StreamCallback & callback,
const SubstreamData & data) const override;
void serializeBinaryBulkStatePrefix(
const IColumn & column,
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void serializeBinaryBulkStateSuffix(
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state,
SubstreamsDeserializeStatesCache * cache) const override;
void serializeBinaryBulkWithMultipleStreams(
const IColumn & column,
size_t offset,
size_t limit,
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void deserializeBinaryBulkWithMultipleStreams(
ColumnPtr & column,
size_t limit,
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state,
SubstreamsCache * cache) const override;
private:
/// The typed path this serialization reads.
String path;
};
}

View File

@ -0,0 +1,259 @@
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/DataTypeVariant.h>
#include <DataTypes/Serializations/SerializationObject.h>
#include <DataTypes/Serializations/SerializationSubObject.h>
#include <Common/logger_useful.h>
namespace DB
{
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
}
/// path_prefix_ - the sub-object prefix to extract,
/// typed_paths_serializations_ - serializations of the typed paths that start
/// with this prefix, keyed by the full (prefixed) path name.
SerializationSubObject::SerializationSubObject(
const String & path_prefix_, const std::unordered_map<String, SerializationPtr> & typed_paths_serializations_)
: path_prefix(path_prefix_)
, typed_paths_serializations(typed_paths_serializations_)
, dynamic_serialization(std::make_shared<SerializationDynamic>())
, shared_data_serialization(SerializationObject::getTypeOfSharedData()->getDefaultSerialization())
{
}
/// Deserialization state of a sub-object subcolumn.
struct DeserializeBinaryBulkStateSubObject : public ISerialization::DeserializeBinaryBulkState
{
/// States of typed path streams, keyed by the full (prefixed) path.
std::unordered_map<String, ISerialization::DeserializeBinaryBulkStatePtr> typed_path_states;
/// States of dynamic path streams whose paths start with the requested prefix.
std::unordered_map<String, ISerialization::DeserializeBinaryBulkStatePtr> dynamic_path_states;
/// Full names of the matching dynamic paths, in the order they were discovered.
std::vector<String> dynamic_paths;
/// The same paths with the prefix stripped (as they appear in the result sub-object).
std::vector<String> dynamic_sub_paths;
/// State of the shared data stream.
ISerialization::DeserializeBinaryBulkStatePtr shared_data_state;
/// Accumulated shared data rows read so far.
ColumnPtr shared_data;
};
/// Reports the substreams needed to read the sub-object: the Object structure
/// stream, all typed paths with the requested prefix, the shared data stream,
/// and (when a deserialization state is available) the matching dynamic paths.
void SerializationSubObject::enumerateStreams(
DB::ISerialization::EnumerateStreamsSettings & settings,
const DB::ISerialization::StreamCallback & callback,
const DB::ISerialization::SubstreamData & data) const
{
settings.path.push_back(Substream::ObjectStructure);
callback(settings.path);
settings.path.pop_back();
const auto * column_object = data.column ? &assert_cast<const ColumnObject &>(*data.column) : nullptr;
const auto * type_object = data.type ? &assert_cast<const DataTypeObject &>(*data.type) : nullptr;
const auto * deserialize_state = data.deserialize_state ? checkAndGetState<DeserializeBinaryBulkStateSubObject>(data.deserialize_state) : nullptr;
settings.path.push_back(Substream::ObjectData);
/// typed_paths_serializations contains only typed paths with requested prefix from original Object column.
for (const auto & [path, serialization] : typed_paths_serializations)
{
settings.path.push_back(Substream::ObjectTypedPath);
settings.path.back().object_path_name = path;
/// path.substr(path_prefix.size() + 1) strips "<prefix>." to get the name
/// under which the path is stored inside the sub-object type/column.
auto path_data = SubstreamData(serialization)
.withType(type_object ? type_object->getTypedPaths().at(path.substr(path_prefix.size() + 1)) : nullptr)
.withColumn(column_object ? column_object->getTypedPaths().at(path.substr(path_prefix.size() + 1)) : nullptr)
.withSerializationInfo(data.serialization_info)
.withDeserializeState(deserialize_state ? deserialize_state->typed_path_states.at(path) : nullptr)
;
settings.path.back().data = path_data;
serialization->enumerateStreams(settings, callback, path_data);
settings.path.pop_back();
}
/// We will need to read shared data to find all paths with requested prefix.
settings.path.push_back(Substream::ObjectSharedData);
auto shared_data_substream_data = SubstreamData(shared_data_serialization)
.withType(data.type ? SerializationObject::getTypeOfSharedData() : nullptr)
.withColumn(data.column ? SerializationObject::getTypeOfSharedData()->createColumn() : nullptr)
.withSerializationInfo(data.serialization_info)
.withDeserializeState(deserialize_state ? deserialize_state->shared_data_state : nullptr);
settings.path.back().data = shared_data_substream_data;
shared_data_serialization->enumerateStreams(settings, callback, shared_data_substream_data);
settings.path.pop_back();
/// If deserialize state is provided, enumerate streams for dynamic paths.
if (deserialize_state)
{
DataTypePtr type = std::make_shared<DataTypeDynamic>();
for (const auto & [path, state] : deserialize_state->dynamic_path_states)
{
settings.path.push_back(Substream::ObjectDynamicPath);
settings.path.back().object_path_name = path;
auto path_data = SubstreamData(dynamic_serialization)
.withType(type_object ? type : nullptr)
.withColumn(nullptr)
.withSerializationInfo(data.serialization_info)
.withDeserializeState(state);
settings.path.back().data = path_data;
dynamic_serialization->enumerateStreams(settings, callback, path_data);
settings.path.pop_back();
}
}
settings.path.pop_back();
}
/// Sub-object subcolumns are read-only: data is written only through the whole
/// Object column, so all bulk serialize* methods are intentionally disabled.
void SerializationSubObject::serializeBinaryBulkStatePrefix(const IColumn &, SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const
{
    throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinaryBulkStatePrefix is not implemented for SerializationSubObject");
}
/// Writing through a sub-object subcolumn is not supported; see the class comment.
void SerializationSubObject::serializeBinaryBulkStateSuffix(SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const
{
    throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinaryBulkStateSuffix is not implemented for SerializationSubObject");
}
namespace
{
/// Strip the "<prefix>." part from a path that is known to start with the prefix.
/// For example, for prefix a.b:
/// a.b.c.d -> c.d, a.b.c -> c
std::string_view getSubPath(const std::string_view & path, const String & prefix)
{
    /// +1 skips the '.' separator between the prefix and the rest of the path.
    return path.substr(prefix.size() + 1);
}

String getSubPath(const String & path, const String & prefix)
{
    /// Delegate to the string_view overload and materialize an owning string.
    return String(getSubPath(std::string_view(path), prefix));
}
}
/// Reads the Object structure prefix, initializes per-stream states for all
/// typed paths, for every dynamic path that starts with the requested prefix,
/// and for the shared data stream.
void SerializationSubObject::deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings, DeserializeBinaryBulkStatePtr & state, SubstreamsDeserializeStatesCache * cache) const
{
auto structure_state = SerializationObject::deserializeObjectStructureStatePrefix(settings, cache);
/// No structure state means there is nothing to read; leave `state` untouched.
if (!structure_state)
return;
auto sub_object_state = std::make_shared<DeserializeBinaryBulkStateSubObject>();
settings.path.push_back(Substream::ObjectData);
for (const auto & [path, serialization] : typed_paths_serializations)
{
settings.path.push_back(Substream::ObjectTypedPath);
settings.path.back().object_path_name = path;
serialization->deserializeBinaryBulkStatePrefix(settings, sub_object_state->typed_path_states[path], cache);
settings.path.pop_back();
}
for (const auto & dynamic_path : checkAndGetState<SerializationObject::DeserializeBinaryBulkStateObjectStructure>(structure_state)->sorted_dynamic_paths)
{
/// Save only dynamic paths with requested prefix.
/// NOTE(review): the match is purely textual - a path like 'ab' would pass
/// starts_with('a'); presumably callers guarantee the prefix is always a
/// complete path segment - confirm.
if (dynamic_path.starts_with(path_prefix) && dynamic_path.size() != path_prefix.size())
{
settings.path.push_back(Substream::ObjectDynamicPath);
settings.path.back().object_path_name = dynamic_path;
dynamic_serialization->deserializeBinaryBulkStatePrefix(settings, sub_object_state->dynamic_path_states[dynamic_path], cache);
settings.path.pop_back();
sub_object_state->dynamic_paths.push_back(dynamic_path);
sub_object_state->dynamic_sub_paths.push_back(getSubPath(dynamic_path, path_prefix));
}
}
settings.path.push_back(Substream::ObjectSharedData);
shared_data_serialization->deserializeBinaryBulkStatePrefix(settings, sub_object_state->shared_data_state, cache);
settings.path.pop_back();
settings.path.pop_back();
state = std::move(sub_object_state);
}
/// Writing through a sub-object subcolumn is not supported; see the class comment.
void SerializationSubObject::serializeBinaryBulkWithMultipleStreams(const IColumn &, size_t, size_t, SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const
{
    throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinaryBulkWithMultipleStreams is not implemented for SerializationSubObject");
}
/// Appends up to `limit` rows of the sub-object to `result_column`: typed and
/// dynamic paths are read from their own streams, and the remaining matching
/// paths are filtered out of the shared data column row by row.
void SerializationSubObject::deserializeBinaryBulkWithMultipleStreams(
ColumnPtr & result_column,
size_t limit,
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state,
SubstreamsCache * cache) const
{
if (!state)
return;
auto * sub_object_state = checkAndGetState<DeserializeBinaryBulkStateSubObject>(state);
auto mutable_column = result_column->assumeMutable();
auto & column_object = assert_cast<ColumnObject &>(*mutable_column);
/// If it's a new object column, set dynamic paths and statistics.
if (column_object.empty())
column_object.setDynamicPaths(sub_object_state->dynamic_sub_paths);
auto & typed_paths = column_object.getTypedPaths();
auto & dynamic_paths = column_object.getDynamicPaths();
settings.path.push_back(Substream::ObjectData);
/// Typed paths are keyed by the full name in the state, but by the stripped
/// sub-path inside the result sub-object column.
for (const auto & [path, serialization] : typed_paths_serializations)
{
settings.path.push_back(Substream::ObjectTypedPath);
settings.path.back().object_path_name = path;
serialization->deserializeBinaryBulkWithMultipleStreams(typed_paths[getSubPath(path, path_prefix)], limit, settings, sub_object_state->typed_path_states[path], cache);
settings.path.pop_back();
}
for (const auto & path : sub_object_state->dynamic_paths)
{
settings.path.push_back(Substream::ObjectDynamicPath);
settings.path.back().object_path_name = path;
dynamic_serialization->deserializeBinaryBulkWithMultipleStreams(dynamic_paths[getSubPath(path, path_prefix)], limit, settings, sub_object_state->dynamic_path_states[path], cache);
settings.path.pop_back();
}
settings.path.push_back(Substream::ObjectSharedData);
/// If it's a new object column, reinitialize column for shared data.
/// NOTE(review): keyed on result_column->empty() after the path reads above;
/// assumes shared_data and the result column grow in lockstep - confirm.
if (result_column->empty())
sub_object_state->shared_data = SerializationObject::getTypeOfSharedData()->createColumn();
/// Number of rows already materialized; used below to index shared data offsets.
size_t prev_size = column_object.size();
shared_data_serialization->deserializeBinaryBulkWithMultipleStreams(sub_object_state->shared_data, limit, settings, sub_object_state->shared_data_state, cache);
settings.path.pop_back();
auto & sub_object_shared_data = column_object.getSharedDataColumn();
const auto & offsets = assert_cast<const ColumnArray &>(*sub_object_state->shared_data).getOffsets();
/// Check if there is no data in shared data in current range.
if (offsets.back() == offsets[ssize_t(prev_size) - 1])
{
sub_object_shared_data.insertManyDefaults(limit);
}
else
{
const auto & shared_data_array = assert_cast<const ColumnArray &>(*sub_object_state->shared_data);
const auto & shared_data_offsets = shared_data_array.getOffsets();
const auto & shared_data_tuple = assert_cast<const ColumnTuple &>(shared_data_array.getData());
const auto & shared_data_paths = assert_cast<const ColumnString &>(shared_data_tuple.getColumn(0));
const auto & shared_data_values = assert_cast<const ColumnString &>(shared_data_tuple.getColumn(1));
auto & sub_object_data_offsets = column_object.getSharedDataOffsets();
auto [sub_object_shared_data_paths, sub_object_shared_data_values] = column_object.getSharedDataPathsAndValues();
StringRef prefix_ref(path_prefix);
/// For every newly read row, binary-search the sorted per-row path list for
/// the first path >= prefix and copy the matching (path, value) pairs with
/// the prefix stripped.
for (size_t i = prev_size; i != shared_data_offsets.size(); ++i)
{
size_t start = shared_data_offsets[ssize_t(i) - 1];
size_t end = shared_data_offsets[ssize_t(i)];
size_t lower_bound_index = ColumnObject::findPathLowerBoundInSharedData(prefix_ref, shared_data_paths, start, end);
for (; lower_bound_index != end; ++lower_bound_index)
{
auto path = shared_data_paths.getDataAt(lower_bound_index).toView();
if (!path.starts_with(path_prefix))
break;
/// Don't include path that is equal to the prefix.
if (path.size() != path_prefix.size())
{
auto sub_path = getSubPath(path, path_prefix);
sub_object_shared_data_paths->insertData(sub_path.data(), sub_path.size());
sub_object_shared_data_values->insertFrom(shared_data_values, lower_bound_index);
}
}
sub_object_data_offsets.push_back(sub_object_shared_data_paths->size());
}
}
settings.path.pop_back();
}
}

View File

@ -0,0 +1,76 @@
#pragma once
#include <DataTypes/Serializations/ISerialization.h>
#include <DataTypes/Serializations/SimpleTextSerialization.h>
namespace DB
{
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
}
/// Serialization of a sub-object Object subcolumns.
/// For example, if we have type JSON and data {"a" : {"b" : {"c" : 42, "d" : "Hello"}}, "c" : [1, 2, 3], "d" : 42}
/// this class will be responsible for reading sub-object a.b and will read JSON column with data {"c" : 42, "d" : "Hello"}.
/// Reading only: all serialize*/text methods throw NOT_IMPLEMENTED.
class SerializationSubObject final : public SimpleTextSerialization
{
public:
SerializationSubObject(const String & path_prefix_, const std::unordered_map<String, SerializationPtr> & typed_paths_serializations_);
void enumerateStreams(
EnumerateStreamsSettings & settings,
const StreamCallback & callback,
const SubstreamData & data) const override;
void serializeBinaryBulkStatePrefix(
const IColumn & column,
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void serializeBinaryBulkStateSuffix(
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state,
SubstreamsDeserializeStatesCache * cache) const override;
void serializeBinaryBulkWithMultipleStreams(
const IColumn & column,
size_t offset,
size_t limit,
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void deserializeBinaryBulkWithMultipleStreams(
ColumnPtr & column,
size_t limit,
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state,
SubstreamsCache * cache) const override;
/// Row-level and text (de)serialization is intentionally unsupported.
void serializeBinary(const Field &, WriteBuffer &, const FormatSettings &) const override { throwNoSerialization(); }
void deserializeBinary(Field &, ReadBuffer &, const FormatSettings &) const override { throwNoSerialization(); }
void serializeBinary(const IColumn &, size_t, WriteBuffer &, const FormatSettings &) const override { throwNoSerialization(); }
void deserializeBinary(IColumn &, ReadBuffer &, const FormatSettings &) const override { throwNoSerialization(); }
void serializeText(const IColumn &, size_t, WriteBuffer &, const FormatSettings &) const override { throwNoSerialization(); }
void deserializeText(IColumn &, ReadBuffer &, const FormatSettings &, bool) const override { throwNoSerialization(); }
bool tryDeserializeText(IColumn &, ReadBuffer &, const FormatSettings &, bool) const override { throwNoSerialization(); }
private:
[[noreturn]] static void throwNoSerialization()
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Text/binary serialization is not implemented for object sub-object subcolumn");
}
/// The requested sub-object prefix.
String path_prefix;
/// Serializations of typed paths that start with the prefix, keyed by full path name.
std::unordered_map<String, SerializationPtr> typed_paths_serializations;
/// Serialization of the Dynamic type, used for dynamic paths.
SerializationPtr dynamic_serialization;
/// Default serialization of the shared data substructure of Object.
SerializationPtr shared_data_serialization;
};
}

View File

@ -218,7 +218,8 @@ void SerializationVariant::serializeBinaryBulkWithMultipleStreamsAndUpdateVarian
size_t limit,
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state,
std::unordered_map<String, size_t> & variants_statistics) const
std::unordered_map<String, size_t> & variants_statistics,
size_t & total_size_of_variants) const
{
const ColumnVariant & col = assert_cast<const ColumnVariant &>(column);
if (const size_t size = col.size(); limit == 0 || offset + limit > size)
@ -265,6 +266,7 @@ void SerializationVariant::serializeBinaryBulkWithMultipleStreamsAndUpdateVarian
/// We can use the same offset/limit as for whole Variant column
variants[non_empty_global_discr]->serializeBinaryBulkWithMultipleStreams(col.getVariantByGlobalDiscriminator(non_empty_global_discr), offset, limit, settings, variant_state->variant_states[non_empty_global_discr]);
variants_statistics[variant_names[non_empty_global_discr]] += limit;
total_size_of_variants += limit;
settings.path.pop_back();
settings.path.pop_back();
return;
@ -315,7 +317,9 @@ void SerializationVariant::serializeBinaryBulkWithMultipleStreamsAndUpdateVarian
{
addVariantElementToPath(settings.path, i);
variants[i]->serializeBinaryBulkWithMultipleStreams(col.getVariantByGlobalDiscriminator(i), 0, 0, settings, variant_state->variant_states[i]);
variants_statistics[variant_names[i]] += col.getVariantByGlobalDiscriminator(i).size();
size_t variant_size = col.getVariantByGlobalDiscriminator(i).size();
variants_statistics[variant_names[i]] += variant_size;
total_size_of_variants += variant_size;
settings.path.pop_back();
}
settings.path.pop_back();
@ -386,6 +390,7 @@ void SerializationVariant::serializeBinaryBulkWithMultipleStreamsAndUpdateVarian
settings,
variant_state->variant_states[i]);
variants_statistics[variant_names[i]] += variant_offsets_and_limits[i].second;
total_size_of_variants += variant_offsets_and_limits[i].second;
settings.path.pop_back();
}
}
@ -400,7 +405,8 @@ void SerializationVariant::serializeBinaryBulkWithMultipleStreams(
DB::ISerialization::SerializeBinaryBulkStatePtr & state) const
{
std::unordered_map<String, size_t> tmp_statistics;
serializeBinaryBulkWithMultipleStreamsAndUpdateVariantStatistics(column, offset, limit, settings, state, tmp_statistics);
size_t tmp_size;
serializeBinaryBulkWithMultipleStreamsAndUpdateVariantStatistics(column, offset, limit, settings, state, tmp_statistics, tmp_size);
}
void SerializationVariant::deserializeBinaryBulkWithMultipleStreams(
@ -1068,6 +1074,16 @@ void SerializationVariant::serializeTextJSON(const IColumn & column, size_t row_
variants[global_discr]->serializeTextJSON(col.getVariantByGlobalDiscriminator(global_discr), col.offsetAt(row_num), ostr, settings);
}
void SerializationVariant::serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent) const
{
const ColumnVariant & col = assert_cast<const ColumnVariant &>(column);
auto global_discr = col.globalDiscriminatorAt(row_num);
if (global_discr == ColumnVariant::NULL_DISCRIMINATOR)
SerializationNullable::serializeNullJSON(ostr);
else
variants[global_discr]->serializeTextJSONPretty(col.getVariantByGlobalDiscriminator(global_discr), col.offsetAt(row_num), ostr, settings, indent);
}
bool SerializationVariant::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
String field;

View File

@ -113,7 +113,8 @@ public:
size_t limit,
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state,
std::unordered_map<String, size_t> & variants_statistics) const;
std::unordered_map<String, size_t> & variants_statistics,
size_t & total_size_of_variants) const;
void deserializeBinaryBulkWithMultipleStreams(
ColumnPtr & column,
@ -145,6 +146,7 @@ public:
bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
void serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent) const override;
void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;

View File

@ -193,16 +193,6 @@ void SerializationVariantElement::deserializeBinaryBulkWithMultipleStreams(
nested_serialization->deserializeBinaryBulkWithMultipleStreams(variant_element_state->variant, *variant_limit, settings, variant_element_state->variant_element_state, cache);
removeVariantFromPath(settings.path);
/// If nothing was deserialized when variant_limit > 0
/// it means that we don't have a stream for such sub-column.
/// It may happen during ALTER MODIFY column with Variant extension.
/// In this case we should just insert default values.
if (variant_element_state->variant->empty())
{
mutable_column->insertManyDefaults(num_new_discriminators);
return;
}
/// If there was nothing to deserialize or nothing was actually deserialized when variant_limit > 0, just insert defaults.
/// The second case means that we don't have a stream for such sub-column. It may happen during ALTER MODIFY column with Variant extension.
if (variant_limit == 0 || variant_element_state->variant->empty())

View File

@ -0,0 +1,80 @@
#include <Columns/ColumnObjectDeprecated.h>
#include <Columns/ColumnString.h>
#include <DataTypes/DataTypeObjectDeprecated.h>
#include <DataTypes/Serializations/SerializationObjectDeprecated.h>
#include <DataTypes/Serializations/SerializationString.h>
#include <IO/ReadBufferFromString.h>
#include <IO/WriteBufferFromString.h>
#include <IO/WriteHelpers.h>
#include <Common/FieldVisitorToString.h>
#include <gtest/gtest.h>
#if USE_SIMDJSON
using namespace DB;
/// Round trip: two JSON documents are written as a plain String column stream,
/// then read back through the deprecated Object("json") serialization, which is
/// expected to parse the strings and infer typed subcolumns per JSON path.
TEST(SerializationObjectDeprecated, FromString)
{
WriteBufferFromOwnString out;
auto column_string = ColumnString::create();
column_string->insert(R"({"k1" : 1, "k2" : [{"k3" : "aa", "k4" : 2}, {"k3": "bb", "k4": 3}]})");
column_string->insert(R"({"k1" : 2, "k2" : [{"k3" : "cc", "k5" : 4}, {"k4": 5}, {"k4": 6}]})");
{
/// Serialize the raw JSON strings with the String serialization into `out`.
auto serialization = std::make_shared<SerializationString>();
ISerialization::SerializeBinaryBulkSettings settings;
ISerialization::SerializeBinaryBulkStatePtr state;
settings.position_independent_encoding = false;
settings.getter = [&out](const auto &) { return &out; };
/// NOTE(review): leading byte looks like a serialization-version marker the
/// Object deserialization expects before the string data — confirm against
/// SerializationObjectDeprecated's read path.
writeIntBinary(static_cast<UInt8>(1), out);
serialization->serializeBinaryBulkStatePrefix(*column_string, settings, state);
serialization->serializeBinaryBulkWithMultipleStreams(*column_string, 0, column_string->size(), settings, state);
serialization->serializeBinaryBulkStateSuffix(settings, state);
}
auto type_object = std::make_shared<DataTypeObjectDeprecated>("json", false);
ColumnPtr result_column = type_object->createColumn();
ReadBufferFromOwnString in(out.str());
{
/// Deserialize the same bytes through the Object type's default serialization.
auto serialization = type_object->getDefaultSerialization();
ISerialization::DeserializeBinaryBulkSettings settings;
ISerialization::DeserializeBinaryBulkStatePtr state;
settings.position_independent_encoding = false;
settings.getter = [&in](const auto &) { return &in; };
serialization->deserializeBinaryBulkStatePrefix(settings, state, nullptr);
serialization->deserializeBinaryBulkWithMultipleStreams(result_column, column_string->size(), settings, state, nullptr);
}
auto & column_object = assert_cast<ColumnObjectDeprecated &>(*result_column->assumeMutable());
column_object.finalize();
/// Two input rows must yield two Object rows with four inferred paths:
/// k1, k2.k3, k2.k4, k2.k5.
ASSERT_TRUE(column_object.size() == 2);
ASSERT_TRUE(column_object.getSubcolumns().size() == 4);
/// Checks one subcolumn's inferred least-common type and its per-row values
/// (compared via their string renderings to sidestep Field type differences).
auto check_subcolumn = [&](const auto & name, const auto & type_name, const std::vector<Field> & expected)
{
const auto & subcolumn = column_object.getSubcolumn(PathInData{name});
ASSERT_EQ(subcolumn.getLeastCommonType()->getName(), type_name);
const auto & data = subcolumn.getFinalizedColumn();
for (size_t i = 0; i < expected.size(); ++i)
ASSERT_EQ(
applyVisitor(FieldVisitorToString(), data[i]),
applyVisitor(FieldVisitorToString(), expected[i]));
};
/// Missing array elements are filled with type defaults ("" / 0).
check_subcolumn("k1", "Int8", {1, 2});
check_subcolumn("k2.k3", "Array(String)", {Array{"aa", "bb"}, Array{"cc", "", ""}});
check_subcolumn("k2.k4", "Array(Int8)", {Array{2, 3}, Array{0, 5, 6}});
check_subcolumn("k2.k5", "Array(Int8)", {Array{0, 0}, Array{4, 0, 0}});
}
#endif

View File

@ -1,80 +1,98 @@
#include <DataTypes/Serializations/SerializationString.h>
#include <IO/ReadBufferFromString.h>
#include <IO/WriteBufferFromString.h>
#include <IO/WriteHelpers.h>
#include <DataTypes/DataTypeObject.h>
#include <DataTypes/Serializations/SerializationObject.h>
#include <Columns/ColumnObject.h>
#include <Columns/ColumnString.h>
#include <Common/FieldVisitorToString.h>
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/Serializations/SerializationObject.h>
#include <IO/ReadBufferFromString.h>
#include <gtest/gtest.h>
#if USE_SIMDJSON
using namespace DB;
TEST(SerializationObject, FromString)
TEST(ObjectSerialization, FieldBinarySerialization)
{
WriteBufferFromOwnString out;
auto column_string = ColumnString::create();
column_string->insert(R"({"k1" : 1, "k2" : [{"k3" : "aa", "k4" : 2}, {"k3": "bb", "k4": 3}]})");
column_string->insert(R"({"k1" : 2, "k2" : [{"k3" : "cc", "k5" : 4}, {"k4": 5}, {"k4": 6}]})");
{
auto serialization = std::make_shared<SerializationString>();
ISerialization::SerializeBinaryBulkSettings settings;
ISerialization::SerializeBinaryBulkStatePtr state;
settings.position_independent_encoding = false;
settings.getter = [&out](const auto &) { return &out; };
writeIntBinary(static_cast<UInt8>(1), out);
serialization->serializeBinaryBulkStatePrefix(*column_string, settings, state);
serialization->serializeBinaryBulkWithMultipleStreams(*column_string, 0, column_string->size(), settings, state);
serialization->serializeBinaryBulkStateSuffix(settings, state);
}
auto type_object = std::make_shared<DataTypeObject>("json", false);
ColumnPtr result_column = type_object->createColumn();
ReadBufferFromOwnString in(out.str());
{
auto serialization = type_object->getDefaultSerialization();
ISerialization::DeserializeBinaryBulkSettings settings;
ISerialization::DeserializeBinaryBulkStatePtr state;
settings.position_independent_encoding = false;
settings.getter = [&in](const auto &) { return &in; };
serialization->deserializeBinaryBulkStatePrefix(settings, state, nullptr);
serialization->deserializeBinaryBulkWithMultipleStreams(result_column, column_string->size(), settings, state, nullptr);
}
auto & column_object = assert_cast<ColumnObject &>(*result_column->assumeMutable());
column_object.finalize();
ASSERT_TRUE(column_object.size() == 2);
ASSERT_TRUE(column_object.getSubcolumns().size() == 4);
auto check_subcolumn = [&](const auto & name, const auto & type_name, const std::vector<Field> & expected)
{
const auto & subcolumn = column_object.getSubcolumn(PathInData{name});
ASSERT_EQ(subcolumn.getLeastCommonType()->getName(), type_name);
const auto & data = subcolumn.getFinalizedColumn();
for (size_t i = 0; i < expected.size(); ++i)
ASSERT_EQ(
applyVisitor(FieldVisitorToString(), data[i]),
applyVisitor(FieldVisitorToString(), expected[i]));
};
check_subcolumn("k1", "Int8", {1, 2});
check_subcolumn("k2.k3", "Array(String)", {Array{"aa", "bb"}, Array{"cc", "", ""}});
check_subcolumn("k2.k4", "Array(Int8)", {Array{2, 3}, Array{0, 5, 6}});
check_subcolumn("k2.k5", "Array(Int8)", {Array{0, 0}, Array{4, 0, 0}});
auto type = DataTypeFactory::instance().get("JSON(max_dynamic_types=10, max_dynamic_paths=2, a.b UInt32, a.c Array(String))");
auto serialization = type->getDefaultSerialization();
Object object1 = Object{{"a.c", Array{"Str1", "Str2"}}, {"a.d", Field(42)}, {"a.e", Tuple{Field(43), "Str3"}}};
WriteBufferFromOwnString ostr;
serialization->serializeBinary(object1, ostr, FormatSettings());
ReadBufferFromString istr(ostr.str());
Field object2;
serialization->deserializeBinary(object2, istr, FormatSettings());
ASSERT_EQ(object1, object2.safeGet<Object>());
}
#endif
/// Row-level binary round trip for the new JSON type: serializing a row and
/// deserializing it back (appending a new row) must reproduce the value exactly.
TEST(ObjectSerialization, ColumnBinarySerialization)
{
auto type = DataTypeFactory::instance().get("JSON(max_dynamic_types=10, max_dynamic_paths=2, a.b UInt32, a.c Array(String))");
auto serialization = type->getDefaultSerialization();
auto col = type->createColumn();
auto & col_object = assert_cast<ColumnObject &>(*col);
/// Row 0: mixes a declared typed path (a.c) with dynamic paths (a.d, a.e).
col_object.insert(Object{{"a.c", Array{"Str1", "Str2"}}, {"a.d", Field(42)}, {"a.e", Tuple{Field(43), "Str3"}}});
WriteBufferFromOwnString ostr1;
serialization->serializeBinary(col_object, 0, ostr1, FormatSettings());
ReadBufferFromString istr1(ostr1.str());
/// deserializeBinary appends the decoded value as row 1.
serialization->deserializeBinary(col_object, istr1, FormatSettings());
ASSERT_EQ(col_object[0], col_object[1]);
/// Row 2: more dynamic paths than max_dynamic_paths=2 allows, exercising the
/// overflow/shared-data handling in the binary format.
col_object.insert(Object{{"a.c", Array{"Str1", "Str2"}}, {"a.e", Field(42)}, {"b.d", Field(42)}, {"b.e", Tuple{Field(43), "Str3"}}, {"b.g", Field("Str4")}});
WriteBufferFromOwnString ostr2;
serialization->serializeBinary(col_object, 2, ostr2, FormatSettings());
ReadBufferFromString istr2(ostr2.str());
serialization->deserializeBinary(col_object, istr2, FormatSettings());
ASSERT_EQ(col_object[2], col_object[3]);
}
/// Text output of the JSON type: compact (serializeTextJSON) and pretty
/// (serializeTextJSONPretty) renderings of a row with deeply nested and
/// colliding paths ("a" both as a scalar and as a prefix of a.b, a.c, ...).
TEST(ObjectSerialization, JSONSerialization)
{
auto type = DataTypeFactory::instance().get("JSON(max_dynamic_types=10, max_dynamic_paths=2, a.b UInt32, a.c Array(String))");
auto serialization = type->getDefaultSerialization();
auto col = type->createColumn();
auto & col_object = assert_cast<ColumnObject &>(*col);
col_object.insert(Object{{"a.c", Array{"Str1", "Str2"}}, {"a.d", Field(42)}, {"a.e", Tuple{Field(43), "Str3"}}});
/// Row 1 (the one serialized below) deliberately sets both "a" and paths under
/// "a.*", which produces duplicate "a"/"b" keys in the expected output.
col_object.insert(Object{{"a.c", Array{"Str1", "Str2"}}, {"a", Tuple{Field(43), "Str3"}}, {"a.b.c", Field(42)}, {"a.b.e", Field(43)}, {"b.c.d.e", Field(42)}, {"b.c.d.g", Field(43)}, {"b.c.h.r", Field(44)}, {"c.g.h.t", Array{Field("Str"), Field("Str2")}}, {"h", Field("Str")}, {"j", Field("Str")}});
WriteBufferFromOwnString buf1;
serialization->serializeTextJSON(col_object, 1, buf1, FormatSettings());
/// NOTE: "a.b" is a declared typed path (UInt32), so it appears as "b":0 even
/// though the inserted row never set it.
ASSERT_EQ(buf1.str(), R"({"a":[43,"Str3"],"a":{"b":0,"b":{"c":42,"e":43},"c":["Str1","Str2"]},"b":{"c":{"d":{"e":42,"g":43},"h":{"r":44}}},"c":{"g":{"h":{"t":["Str","Str2"]}}},"h":"Str","j":"Str"})");
WriteBufferFromOwnString buf2;
serialization->serializeTextJSONPretty(col_object, 1, buf2, FormatSettings(), 0);
/// Same row pretty-printed with 4-space indentation; the raw string must match
/// byte-for-byte, including the duplicate keys.
ASSERT_EQ(buf2.str(), R"({
"a" : [
43,
"Str3"
],
"a" : {
"b" : 0,
"b" : {
"c" : 42,
"e" : 43
},
"c" : [
"Str1",
"Str2"
]
},
"b" : {
"c" : {
"d" : {
"e" : 42,
"g" : 43
},
"h" : {
"r" : 44
}
}
},
"c" : {
"g" : {
"h" : {
"t" : [
"Str",
"Str2"
]
}
}
},
"h" : "Str",
"j" : "Str"
})");
}

Some files were not shown because too many files have changed in this diff Show More