Merge pull request #63798 from Blargian/variant_inference

setting to use Variant data type during schema inference
This commit is contained in:
Kruglov Pavel 2024-08-21 13:06:00 +00:00 committed by GitHub
commit aa96fd3385
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 117 additions and 1 deletions

View File

@ -194,6 +194,17 @@ If enabled, ClickHouse will try to infer type `DateTime64` from string fields in
Enabled by default.
## input_format_try_infer_variants {#input_format_try_infer_variants}
If enabled, ClickHouse will try to infer type [`Variant`](../../sql-reference/data-types/variant.md) during schema inference for text formats when there is more than one possible type for the elements of a column or an array.
Possible values:
- 0 — Disabled.
- 1 — Enabled.
Default value: `0`.
## date_time_input_format {#date_time_input_format}
Allows choosing a parser of the text representation of date and time.

View File

@ -1136,6 +1136,7 @@ class IColumn;
M(Bool, input_format_json_defaults_for_missing_elements_in_named_tuple, true, "Insert default value in named tuple element if it's missing in json object", 0) \
M(Bool, input_format_json_throw_on_bad_escape_sequence, true, "Throw an exception if JSON string contains bad escape sequence in JSON input formats. If disabled, bad escape sequences will remain as is in the data", 0) \
M(Bool, input_format_json_ignore_unnecessary_fields, true, "Ignore unnecessary fields and not parse them. Enabling this may not throw exceptions on json strings of invalid format or with duplicated fields", 0) \
M(Bool, input_format_try_infer_variants, false, "Try to infer the Variant type in text formats when there is more than one possible type for column/array elements", 0) \
M(Bool, type_json_skip_duplicated_paths, false, "When enabled, during parsing JSON object into JSON type duplicated paths will be ignored and only the first one will be inserted instead of an exception", 0) \
M(UInt64, input_format_json_max_depth, 1000, "Maximum depth of a field in JSON. This is not a strict limit, it does not have to be applied precisely.", 0) \
M(Bool, input_format_try_infer_integers, true, "Try to infer integers instead of floats while schema inference in text formats", 0) \

View File

@ -71,6 +71,7 @@ static std::initializer_list<std::pair<ClickHouseVersion, SettingsChangesHistory
},
{"24.9",
{
{"input_format_try_infer_variants", false, false, "Try to infer Variant type in text formats when there is more than one possible type for column/array elements"},
}
},
{"24.8",

View File

@ -283,6 +283,7 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se
format_settings.max_parser_depth = context->getSettingsRef().max_parser_depth;
format_settings.client_protocol_version = context->getClientProtocolVersion();
format_settings.date_time_overflow_behavior = settings.date_time_overflow_behavior;
format_settings.try_infer_variant = settings.input_format_try_infer_variants;
/// Validate avro_schema_registry_url with RemoteHostFilter when non-empty and in Server context
if (format_settings.schema.is_server)

View File

@ -35,6 +35,7 @@ struct FormatSettings
bool decimal_trailing_zeros = false;
bool defaults_for_omitted_fields = true;
bool is_writing_to_terminal = false;
bool try_infer_variant = false;
bool seekable_read = true;
UInt64 max_rows_to_read_for_schema_inference = 25000;

View File

@ -7,6 +7,7 @@
#include <DataTypes/DataTypeDate.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypeVariant.h>
#include <DataTypes/DataTypeMap.h>
#include <DataTypes/DataTypeLowCardinality.h>
#include <DataTypes/DataTypeNothing.h>
@ -306,6 +307,33 @@ namespace
type_indexes.erase(TypeIndex::UInt64);
}
/// Used when setting 'try_infer_variant' is enabled: if the given types are not all equal,
/// replace every one of them with a single Variant type built from all of them.
/// Types that are already Variant contribute their nested variants instead of themselves,
/// so the resulting Variant is built from a flat list of alternatives.
/// On change, type_indexes is reset to contain only TypeIndex::Variant.
void transformVariant(DataTypes & data_types, TypeIndexesSet & type_indexes)
{
    /// All types are the same, there is nothing to unify.
    if (checkIfTypesAreEqual(data_types))
        return;

    DataTypes alternatives;
    for (const auto & candidate : data_types)
    {
        const auto * nested_variant = typeid_cast<const DataTypeVariant *>(candidate.get());
        if (!nested_variant)
        {
            alternatives.push_back(candidate);
            continue;
        }

        /// Flatten an existing Variant: take its alternatives rather than the Variant itself.
        const auto & inner_types = nested_variant->getVariants();
        alternatives.insert(alternatives.end(), inner_types.begin(), inner_types.end());
    }

    auto unified_type = std::make_shared<DataTypeVariant>(alternatives);

    /// Every input slot now refers to the same Variant type.
    for (auto & slot : data_types)
        slot = unified_type;

    type_indexes = {TypeIndex::Variant};
}
/// If we have only date/datetimes types (Date/DateTime/DateTime64), convert all of them to the common type,
/// otherwise, convert all Date, DateTime and DateTime64 to String.
void transformDatesAndDateTimes(DataTypes & data_types, TypeIndexesSet & type_indexes)
@ -652,7 +680,11 @@ namespace
transformDatesAndDateTimes(data_types, type_indexes);
if constexpr (!is_json)
{
if (settings.try_infer_variant)
transformVariant(data_types, type_indexes);
return;
}
/// Check settings specific for JSON formats.
@ -670,6 +702,10 @@ namespace
if (settings.json.try_infer_objects_as_tuples)
mergeJSONPaths(data_types, type_indexes, settings, json_info);
if (settings.try_infer_variant)
transformVariant(data_types, type_indexes);
};
auto transform_complex_types = [&](DataTypes & data_types, TypeIndexesSet & type_indexes)
@ -682,7 +718,11 @@ namespace
transformNothingComplexTypes(data_types, type_indexes);
if constexpr (!is_json)
{
if (settings.try_infer_variant)
transformVariant(data_types, type_indexes);
return;
}
/// Convert JSON tuples with same nested types to arrays.
transformTuplesWithEqualNestedTypesToArrays(data_types, type_indexes);
@ -695,6 +735,9 @@ namespace
if (json_info && json_info->allow_merging_named_tuples)
mergeNamedTuples(data_types, type_indexes, settings, json_info);
if (settings.try_infer_variant)
transformVariant(data_types, type_indexes);
};
transformTypesRecursively(types, transform_simple_types, transform_complex_types);
@ -861,7 +904,6 @@ namespace
if (checkIfTypesAreEqual(nested_types_copy))
return std::make_shared<DataTypeArray>(nested_types_copy.back());
return std::make_shared<DataTypeTuple>(nested_types);
}
else
@ -1456,6 +1498,15 @@ void transformFinalInferredJSONTypeIfNeededImpl(DataTypePtr & data_type, const F
return;
}
/// Variant: apply the final transformation to each nested variant type and rebuild the Variant from the results.
if (const auto * variant_type = typeid_cast<const DataTypeVariant *>(data_type.get()))
{
/// Work on a copy of the nested types, the recursive call modifies them in place.
auto nested_types = variant_type->getVariants();
for (auto & nested_type : nested_types)
transformFinalInferredJSONTypeIfNeededImpl(nested_type, settings, json_info, remain_nothing_types);
/// Replace the original type with a new Variant constructed from the transformed nested types.
data_type = std::make_shared<DataTypeVariant>(nested_types);
return;
}
}
void transformFinalInferredJSONTypeIfNeeded(DataTypePtr & data_type, const FormatSettings & settings, JSONInferenceInfo * json_info)
@ -1535,6 +1586,20 @@ DataTypePtr makeNullableRecursively(DataTypePtr type)
return nested_type ? std::make_shared<DataTypeArray>(nested_type) : nullptr;
}
/// Variant: build a new Variant whose nested types are processed one by one.
if (which.isVariant())
{
const auto * variant_type = assert_cast<const DataTypeVariant *>(type.get());
DataTypes nested_types;
for (const auto & nested_type: variant_type->getVariants())
{
/// Recurse only into nested types that have subtypes and are not LowCardinality;
/// all other nested types are put into the new Variant unchanged.
if (!nested_type->lowCardinality() && nested_type->haveSubtypes())
nested_types.push_back(makeNullableRecursively(nested_type));
else
nested_types.push_back(nested_type);
}
return std::make_shared<DataTypeVariant>(nested_types);
}
if (which.isTuple())
{
const auto * tuple_type = assert_cast<const DataTypeTuple *>(type.get());

View File

@ -0,0 +1,31 @@
┏━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ arr ┃ toTypeName(arr) ┃
┡━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
1. │ ['1','Hello',(32)] │ Array(Variant(String, Tuple(
a Nullable(Int64)))) │
└────────────────────┴──────────────────────────────────────────────────────┘
┏━━━━━━━┳━━━━━━━━━━━━━━━━━━┓
┃ x ┃ toTypeName(x) ┃
┡━━━━━━━╇━━━━━━━━━━━━━━━━━━┩
1. │ 42 │ Nullable(String) │
├───────┼──────────────────┤
2. │ Hello │ Nullable(String) │
└───────┴──────────────────┘
┏━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ x ┃ toTypeName(x) ┃
┡━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
1. │ [1,2,3] │ Variant(Array(Nullable(Int64)), Tuple(
a Nullable(Int64))) │
├─────────┼───────────────────────────────────────────────────────────────┤
2. │ (42) │ Variant(Array(Nullable(Int64)), Tuple(
a Nullable(Int64))) │
└─────────┴───────────────────────────────────────────────────────────────┘
┏━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ c1 ┃ toTypeName(c1) ┃ c2 ┃ toTypeName(c2) ┃
┡━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
1. │ 1 │ Nullable(Int64) │ Hello World! │ Variant(Array(Nullable(Int64)), String) │
├────┼─────────────────┼──────────────┼─────────────────────────────────────────┤
2. │ 2 │ Nullable(Int64) │ [1,2,3] │ Variant(Array(Nullable(Int64)), String) │
├────┼─────────────────┼──────────────┼─────────────────────────────────────────┤
3. │ 3 │ Nullable(Int64) │ 2020-01-01 │ Variant(Array(Nullable(Int64)), String) │
└────┴─────────────────┴──────────────┴─────────────────────────────────────────┘

View File

@ -0,0 +1,5 @@
SET input_format_try_infer_variants=1;
SELECT arr, toTypeName(arr) FROM format('JSONEachRow', '{"arr" : [1, "Hello", {"a" : 32}]}') FORMAT Pretty;
SELECT x, toTypeName(x) FROM format('JSONEachRow', '{"x" : 42}, {"x" : "Hello"}') FORMAT Pretty;
SELECT x, toTypeName(x) FROM format('JSONEachRow', '{"x" : [1, 2, 3]}, {"x" : {"a" : 42}}') FORMAT Pretty;
SELECT c1, toTypeName(c1), c2, toTypeName(c2) FROM format('CSV', '1,Hello World!\n2,"[1,2,3]"\n3,"2020-01-01"\n') FORMAT Pretty;