From cfa510ea0ac324fc9c279f3f4afcb621104541d9 Mon Sep 17 00:00:00 2001
From: avogar
Date: Mon, 23 Oct 2023 14:38:34 +0000
Subject: [PATCH] Add more documentation, fix build

---
 docs/en/interfaces/schema-inference.md  | 99 +++++++++++++++++++++++++
 docs/en/operations/settings/settings.md |  3 +
 src/Storages/StorageS3.cpp              | 12 +--
 3 files changed, 108 insertions(+), 6 deletions(-)

diff --git a/docs/en/interfaces/schema-inference.md b/docs/en/interfaces/schema-inference.md
index 0aadb09730a..f361bd6cdb2 100644
--- a/docs/en/interfaces/schema-inference.md
+++ b/docs/en/interfaces/schema-inference.md
@@ -1846,3 +1846,102 @@ DESC format(JSONAsString, '{"x" : 42, "y" : "Hello, World!"}') SETTINGS allow_ex
 │ json │ Object('json') │              │                    │         │                  │                │
 └──────┴────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
 ```
+
+## Schema inference modes {#schema-inference-modes}
+
+Schema inference from a set of data files can work in two different modes: `default` and `union`.
+The mode is controlled by the setting `schema_inference_mode`.
+
+### Default mode {#default-schema-inference-mode}
+
+In default mode, ClickHouse assumes that all files have the same schema and tries to infer the schema by reading the files one by one until it succeeds.
+
+Example:
+
+Let's say we have 3 files `data1.jsonl`, `data2.jsonl` and `data3.jsonl` with the following content:
+
+`data1.jsonl`:
+```json
+{"field1" : 1, "field2" : null}
+{"field1" : 2, "field2" : null}
+{"field1" : 3, "field2" : null}
+```
+
+`data2.jsonl`:
+```json
+{"field1" : 4, "field2" : "Data4"}
+{"field1" : 5, "field2" : "Data5"}
+{"field1" : 6, "field2" : "Data5"}
+```
+
+`data3.jsonl`:
+```json
+{"field1" : 7, "field2" : "Data7", "field3" : [1, 2, 3]}
+{"field1" : 8, "field2" : "Data8", "field3" : [4, 5, 6]}
+{"field1" : 9, "field2" : "Data9", "field3" : [7, 8, 9]}
+```
+
+Let's try to use schema inference on these 3 files:
+```sql
+:) DESCRIBE file('data{1,2,3}.jsonl') SETTINGS schema_inference_mode='default'
+```
+
+Result:
+```text
+┌─name───┬─type─────────────┐
+│ field1 │ Nullable(Int64)  │
+│ field2 │ Nullable(String) │
+└────────┴──────────────────┘
+```
+
+As we can see, we don't have `field3` from the file `data3.jsonl`.
+This happens because ClickHouse first tried to infer the schema from `data1.jsonl` and failed, since the type of `field2` cannot be determined from nulls alone;
+it then tried `data2.jsonl` and succeeded, so the data from `data3.jsonl` was never read.
+
+### Union mode {#union-schema-inference-mode}
+
+In union mode, ClickHouse assumes that the files can have different schemas, so it infers the schemas of all files and then unions them into a common schema.
+
+Let's say we have 3 files `data1.jsonl`, `data2.jsonl` and `data3.jsonl` with the following content:
+
+`data1.jsonl`:
+```json
+{"field1" : 1}
+{"field1" : 2}
+{"field1" : 3}
+```
+
+`data2.jsonl`:
+```json
+{"field2" : "Data4"}
+{"field2" : "Data5"}
+{"field2" : "Data5"}
+```
+
+`data3.jsonl`:
+```json
+{"field3" : [1, 2, 3]}
+{"field3" : [4, 5, 6]}
+{"field3" : [7, 8, 9]}
+```
+
+Let's try to use schema inference on these 3 files:
+```sql
+:) DESCRIBE file('data{1,2,3}.jsonl') SETTINGS schema_inference_mode='union'
+```
+
+Result:
+```text
+┌─name───┬─type───────────────────┐
+│ field1 │ Nullable(Int64)        │
+│ field2 │ Nullable(String)       │
+│ field3 │ Array(Nullable(Int64)) │
+└────────┴────────────────────────┘
+```
+
+As we can see, we have all fields from all files.
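+
+If we then read the files with this unioned schema, rows from files that lack one of the columns are expected to get `NULL` in it, since the unioned types are `Nullable` and omitted fields are filled with default values. A minimal sketch, reusing the same three files:
+```sql
+-- Sketch: each file contributes only its own column; the other columns should read as NULL.
+:) SELECT * FROM file('data{1,2,3}.jsonl') SETTINGS schema_inference_mode='union'
+```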
+
+Note:
+- As some of the files may not contain some columns from the resulting schema, union mode is supported only for formats that support reading a subset of columns (like JSONEachRow, Parquet, TSVWithNames, etc.) and won't work for other formats (like CSV, TSV, JSONCompactEachRow, etc.).
+- If ClickHouse cannot infer the schema from one of the files, an exception will be thrown.
+- If you have a lot of files, reading the schema from all of them can take a lot of time.
diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md
index 1c8c9720121..27ac051631a 100644
--- a/docs/en/operations/settings/settings.md
+++ b/docs/en/operations/settings/settings.md
@@ -4305,6 +4305,8 @@ Default value: `1GiB`.
 
 ## Schema Inference settings
 
+See the [schema inference](../../interfaces/schema-inference.md#schema-inference-modes) documentation for more details.
+
 ### schema_inference_use_cache_for_file {schema_inference_use_cache_for_file}
 
 Enable schemas cache for schema inference in `file` table function.
@@ -4349,6 +4351,7 @@ Default value: 2.
 ### schema_inference_mode {schema_inference_mode}
 
 The mode of schema inference. Possible values: `default` and `union`.
+See the [schema inference modes](../../interfaces/schema-inference.md#schema-inference-modes) section for more details.
 
 Default value: `default`.
 
diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp
index b6df46ed589..7430ec5e65c 100644
--- a/src/Storages/StorageS3.cpp
+++ b/src/Storages/StorageS3.cpp
@@ -1539,7 +1539,7 @@ namespace
             /// In union mode, check cached columns only for current key.
             if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::UNION)
             {
-                StorageS3::KeysWithInfo keys = {*current_key_with_info};
+                StorageS3::KeysWithInfo keys = {current_key_with_info};
                 if (auto columns_from_cache = tryGetColumnsFromCache(keys.begin(), keys.end()))
                 {
                     first = false;
@@ -1587,7 +1587,7 @@
         auto host_and_bucket = fs::path(configuration.url.uri.getHost() + std::to_string(configuration.url.uri.getPort())) / configuration.url.bucket;
         Strings sources;
         sources.reserve(read_keys.size());
-        std::transform(read_keys.begin(), read_keys.end(), std::back_inserter(sources), [&](const auto & elem){ return host_and_bucket / elem.key; });
+        std::transform(read_keys.begin(), read_keys.end(), std::back_inserter(sources), [&](const auto & elem){ return host_and_bucket / elem->key; });
         auto cache_keys = getKeysForSchemaCache(sources, configuration.format, format_settings, getContext());
         StorageS3::getSchemaCache(getContext()).addManyColumns(cache_keys, columns);
     }
@@ -1608,9 +1608,9 @@
             auto get_last_mod_time = [&]
             {
                 time_t last_modification_time = 0;
-                if (it->info)
+                if ((*it)->info)
                 {
-                    last_modification_time = it->info->last_modification_time;
+                    last_modification_time = (*it)->info->last_modification_time;
                 }
                 else
                 {
@@ -1620,7 +1620,7 @@
                     last_modification_time = S3::getObjectInfo(
                         *configuration.client,
                         configuration.url.bucket,
-                        it->key,
+                        (*it)->key,
                         configuration.url.version_id,
                         configuration.request_settings,
                         /*with_metadata=*/ false,
@@ -1631,7 +1631,7 @@
                 return last_modification_time ? std::make_optional(last_modification_time) : std::nullopt;
             };
 
-            String path = fs::path(configuration.url.bucket) / it->key;
+            String path = fs::path(configuration.url.bucket) / (*it)->key;
             String source = fs::path(configuration.url.uri.getHost() + std::to_string(configuration.url.uri.getPort())) / path;
             auto cache_key = getKeyForSchemaCache(source, configuration.format, format_settings, getContext());
             auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time);
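
For a quick sanity check of the behavior the `StorageS3` changes above support, a hedged sketch (the bucket URL is illustrative, and `JSONEachRow` stands in for any format that supports reading a subset of columns):

```sql
-- Hypothetical bucket; in union mode the schema of each S3 key is inferred and cached separately.
DESCRIBE s3('https://my-bucket.s3.amazonaws.com/data{1,2,3}.jsonl', 'JSONEachRow')
SETTINGS schema_inference_mode='union'
```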