mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-09 17:14:47 +00:00
Merge pull request #63798 from Blargian/variant_inference
setting to use Variant data type during schema inference
This commit is contained in:
commit
aa96fd3385
@ -194,6 +194,17 @@ If enabled, ClickHouse will try to infer type `DateTime64` from string fields in
|
||||
|
||||
Enabled by default.
|
||||
|
||||
## input_format_try_infer_variants {#input_format_try_infer_variants}
|
||||
|
||||
If enabled, ClickHouse will try to infer the [`Variant`](../../sql-reference/data-types/variant.md) type during schema inference for text formats when a column or the elements of an array have more than one possible type.
|
||||
|
||||
Possible values:
|
||||
|
||||
- 0 — Disabled.
|
||||
- 1 — Enabled.
|
||||
|
||||
Default value: `0`.
|
||||
|
||||
## date_time_input_format {#date_time_input_format}
|
||||
|
||||
Allows choosing a parser of the text representation of date and time.
|
||||
|
@ -1136,6 +1136,7 @@ class IColumn;
|
||||
M(Bool, input_format_json_defaults_for_missing_elements_in_named_tuple, true, "Insert default value in named tuple element if it's missing in json object", 0) \
|
||||
M(Bool, input_format_json_throw_on_bad_escape_sequence, true, "Throw an exception if JSON string contains bad escape sequence in JSON input formats. If disabled, bad escape sequences will remain as is in the data", 0) \
|
||||
M(Bool, input_format_json_ignore_unnecessary_fields, true, "Ignore unnecessary fields and not parse them. Enabling this may not throw exceptions on json strings of invalid format or with duplicated fields", 0) \
|
||||
M(Bool, input_format_try_infer_variants, false, "Try to infer the Variant type in text formats when there is more than one possible type for column/array elements", 0) \
|
||||
M(Bool, type_json_skip_duplicated_paths, false, "When enabled, during parsing JSON object into JSON type duplicated paths will be ignored and only the first one will be inserted instead of an exception", 0) \
|
||||
M(UInt64, input_format_json_max_depth, 1000, "Maximum depth of a field in JSON. This is not a strict limit, it does not have to be applied precisely.", 0) \
|
||||
M(Bool, input_format_try_infer_integers, true, "Try to infer integers instead of floats while schema inference in text formats", 0) \
|
||||
|
@ -71,6 +71,7 @@ static std::initializer_list<std::pair<ClickHouseVersion, SettingsChangesHistory
|
||||
},
|
||||
{"24.9",
|
||||
{
|
||||
{"input_format_try_infer_variants", false, false, "Try to infer Variant type in text formats when there is more than one possible type for column/array elements"},
|
||||
}
|
||||
},
|
||||
{"24.8",
|
||||
|
@ -283,6 +283,7 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se
|
||||
format_settings.max_parser_depth = context->getSettingsRef().max_parser_depth;
|
||||
format_settings.client_protocol_version = context->getClientProtocolVersion();
|
||||
format_settings.date_time_overflow_behavior = settings.date_time_overflow_behavior;
|
||||
format_settings.try_infer_variant = settings.input_format_try_infer_variants;
|
||||
|
||||
/// Validate avro_schema_registry_url with RemoteHostFilter when non-empty and in Server context
|
||||
if (format_settings.schema.is_server)
|
||||
|
@ -35,6 +35,7 @@ struct FormatSettings
|
||||
bool decimal_trailing_zeros = false;
|
||||
bool defaults_for_omitted_fields = true;
|
||||
bool is_writing_to_terminal = false;
|
||||
bool try_infer_variant = false;
|
||||
|
||||
bool seekable_read = true;
|
||||
UInt64 max_rows_to_read_for_schema_inference = 25000;
|
||||
|
@ -7,6 +7,7 @@
|
||||
#include <DataTypes/DataTypeDate.h>
|
||||
#include <DataTypes/DataTypeArray.h>
|
||||
#include <DataTypes/DataTypeTuple.h>
|
||||
#include <DataTypes/DataTypeVariant.h>
|
||||
#include <DataTypes/DataTypeMap.h>
|
||||
#include <DataTypes/DataTypeLowCardinality.h>
|
||||
#include <DataTypes/DataTypeNothing.h>
|
||||
@ -306,6 +307,33 @@ namespace
|
||||
type_indexes.erase(TypeIndex::UInt64);
|
||||
}
|
||||
|
||||
/// if setting 'try_infer_variant' is true then we convert to type variant.
|
||||
void transformVariant(DataTypes & data_types, TypeIndexesSet & type_indexes)
|
||||
{
|
||||
if (checkIfTypesAreEqual(data_types))
|
||||
return;
|
||||
|
||||
DataTypes variant_types;
|
||||
for (const auto & type : data_types)
|
||||
{
|
||||
if (const auto * variant_type = typeid_cast<const DataTypeVariant *>(type.get()))
|
||||
{
|
||||
const auto & current_variants = variant_type->getVariants();
|
||||
variant_types.insert(variant_types.end(), current_variants.begin(), current_variants.end());
|
||||
}
|
||||
else
|
||||
{
|
||||
variant_types.push_back(type);
|
||||
}
|
||||
}
|
||||
|
||||
auto variant_type = std::make_shared<DataTypeVariant>(variant_types);
|
||||
|
||||
for (auto & type : data_types)
|
||||
type = variant_type;
|
||||
type_indexes = {TypeIndex::Variant};
|
||||
}
|
||||
|
||||
/// If we have only date/datetimes types (Date/DateTime/DateTime64), convert all of them to the common type,
|
||||
/// otherwise, convert all Date, DateTime and DateTime64 to String.
|
||||
void transformDatesAndDateTimes(DataTypes & data_types, TypeIndexesSet & type_indexes)
|
||||
@ -652,7 +680,11 @@ namespace
|
||||
transformDatesAndDateTimes(data_types, type_indexes);
|
||||
|
||||
if constexpr (!is_json)
|
||||
{
|
||||
if (settings.try_infer_variant)
|
||||
transformVariant(data_types, type_indexes);
|
||||
return;
|
||||
}
|
||||
|
||||
/// Check settings specific for JSON formats.
|
||||
|
||||
@ -670,6 +702,10 @@ namespace
|
||||
|
||||
if (settings.json.try_infer_objects_as_tuples)
|
||||
mergeJSONPaths(data_types, type_indexes, settings, json_info);
|
||||
|
||||
if (settings.try_infer_variant)
|
||||
transformVariant(data_types, type_indexes);
|
||||
|
||||
};
|
||||
|
||||
auto transform_complex_types = [&](DataTypes & data_types, TypeIndexesSet & type_indexes)
|
||||
@ -682,7 +718,11 @@ namespace
|
||||
transformNothingComplexTypes(data_types, type_indexes);
|
||||
|
||||
if constexpr (!is_json)
|
||||
{
|
||||
if (settings.try_infer_variant)
|
||||
transformVariant(data_types, type_indexes);
|
||||
return;
|
||||
}
|
||||
|
||||
/// Convert JSON tuples with same nested types to arrays.
|
||||
transformTuplesWithEqualNestedTypesToArrays(data_types, type_indexes);
|
||||
@ -695,6 +735,9 @@ namespace
|
||||
|
||||
if (json_info && json_info->allow_merging_named_tuples)
|
||||
mergeNamedTuples(data_types, type_indexes, settings, json_info);
|
||||
|
||||
if (settings.try_infer_variant)
|
||||
transformVariant(data_types, type_indexes);
|
||||
};
|
||||
|
||||
transformTypesRecursively(types, transform_simple_types, transform_complex_types);
|
||||
@ -861,7 +904,6 @@ namespace
|
||||
|
||||
if (checkIfTypesAreEqual(nested_types_copy))
|
||||
return std::make_shared<DataTypeArray>(nested_types_copy.back());
|
||||
|
||||
return std::make_shared<DataTypeTuple>(nested_types);
|
||||
}
|
||||
else
|
||||
@ -1456,6 +1498,15 @@ void transformFinalInferredJSONTypeIfNeededImpl(DataTypePtr & data_type, const F
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
if (const auto * variant_type = typeid_cast<const DataTypeVariant *>(data_type.get()))
|
||||
{
|
||||
auto nested_types = variant_type->getVariants();
|
||||
for (auto & nested_type : nested_types)
|
||||
transformFinalInferredJSONTypeIfNeededImpl(nested_type, settings, json_info, remain_nothing_types);
|
||||
data_type = std::make_shared<DataTypeVariant>(nested_types);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
void transformFinalInferredJSONTypeIfNeeded(DataTypePtr & data_type, const FormatSettings & settings, JSONInferenceInfo * json_info)
|
||||
@ -1535,6 +1586,20 @@ DataTypePtr makeNullableRecursively(DataTypePtr type)
|
||||
return nested_type ? std::make_shared<DataTypeArray>(nested_type) : nullptr;
|
||||
}
|
||||
|
||||
if (which.isVariant())
|
||||
{
|
||||
const auto * variant_type = assert_cast<const DataTypeVariant *>(type.get());
|
||||
DataTypes nested_types;
|
||||
for (const auto & nested_type: variant_type->getVariants())
|
||||
{
|
||||
if (!nested_type->lowCardinality() && nested_type->haveSubtypes())
|
||||
nested_types.push_back(makeNullableRecursively(nested_type));
|
||||
else
|
||||
nested_types.push_back(nested_type);
|
||||
}
|
||||
return std::make_shared<DataTypeVariant>(nested_types);
|
||||
}
|
||||
|
||||
if (which.isTuple())
|
||||
{
|
||||
const auto * tuple_type = assert_cast<const DataTypeTuple *>(type.get());
|
||||
|
31
tests/queries/0_stateless/03150_infer_type_variant.reference
Normal file
31
tests/queries/0_stateless/03150_infer_type_variant.reference
Normal file
@ -0,0 +1,31 @@
|
||||
┏━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
|
||||
┃ arr ┃ toTypeName(arr) ┃
|
||||
┡━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
|
||||
1. │ ['1','Hello',(32)] │ Array(Variant(String, Tuple(
|
||||
a Nullable(Int64)))) │
|
||||
└────────────────────┴──────────────────────────────────────────────────────┘
|
||||
┏━━━━━━━┳━━━━━━━━━━━━━━━━━━┓
|
||||
┃ x ┃ toTypeName(x) ┃
|
||||
┡━━━━━━━╇━━━━━━━━━━━━━━━━━━┩
|
||||
1. │ 42 │ Nullable(String) │
|
||||
├───────┼──────────────────┤
|
||||
2. │ Hello │ Nullable(String) │
|
||||
└───────┴──────────────────┘
|
||||
┏━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
|
||||
┃ x ┃ toTypeName(x) ┃
|
||||
┡━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
|
||||
1. │ [1,2,3] │ Variant(Array(Nullable(Int64)), Tuple(
|
||||
a Nullable(Int64))) │
|
||||
├─────────┼───────────────────────────────────────────────────────────────┤
|
||||
2. │ (42) │ Variant(Array(Nullable(Int64)), Tuple(
|
||||
a Nullable(Int64))) │
|
||||
└─────────┴───────────────────────────────────────────────────────────────┘
|
||||
┏━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
|
||||
┃ c1 ┃ toTypeName(c1) ┃ c2 ┃ toTypeName(c2) ┃
|
||||
┡━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
|
||||
1. │ 1 │ Nullable(Int64) │ Hello World! │ Variant(Array(Nullable(Int64)), String) │
|
||||
├────┼─────────────────┼──────────────┼─────────────────────────────────────────┤
|
||||
2. │ 2 │ Nullable(Int64) │ [1,2,3] │ Variant(Array(Nullable(Int64)), String) │
|
||||
├────┼─────────────────┼──────────────┼─────────────────────────────────────────┤
|
||||
3. │ 3 │ Nullable(Int64) │ 2020-01-01 │ Variant(Array(Nullable(Int64)), String) │
|
||||
└────┴─────────────────┴──────────────┴─────────────────────────────────────────┘
|
5
tests/queries/0_stateless/03150_infer_type_variant.sql
Normal file
5
tests/queries/0_stateless/03150_infer_type_variant.sql
Normal file
@ -0,0 +1,5 @@
|
||||
-- Enable the setting under test before running the schema-inference queries.
SET input_format_try_infer_variants=1;
|
||||
-- JSON array whose elements are a number, a string and an object.
SELECT arr, toTypeName(arr) FROM format('JSONEachRow', '{"arr" : [1, "Hello", {"a" : 32}]}') FORMAT Pretty;
|
||||
-- Column x is a number in one row and a string in the other; the reference output expects Nullable(String) here, not a Variant.
SELECT x, toTypeName(x) FROM format('JSONEachRow', '{"x" : 42}, {"x" : "Hello"}') FORMAT Pretty;
|
||||
-- Column x is an array in one row and an object in the other.
SELECT x, toTypeName(x) FROM format('JSONEachRow', '{"x" : [1, 2, 3]}, {"x" : {"a" : 42}}') FORMAT Pretty;
|
||||
-- CSV input: c2 holds a plain string, an array literal and a date-like string across rows.
SELECT c1, toTypeName(c1), c2, toTypeName(c2) FROM format('CSV', '1,Hello World!\n2,"[1,2,3]"\n3,"2020-01-01"\n') FORMAT Pretty;
|
Loading…
Reference in New Issue
Block a user