From 9cf11a210f07110676b373b864ea098583d87ff6 Mon Sep 17 00:00:00 2001 From: Blargian Date: Tue, 11 Jun 2024 11:11:06 +0200 Subject: [PATCH] Review changes --- src/Core/SettingsChangesHistory.h | 3 +- src/Formats/SchemaInferenceUtils.cpp | 73 ++++++------------- .../03150_infer_type_variant.reference | 24 +++--- 3 files changed, 35 insertions(+), 65 deletions(-) diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index 3f743ef42bf..661ecc607ba 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -92,6 +92,7 @@ static std::map sett {"hdfs_ignore_file_doesnt_exist", false, false, "Allow to return 0 rows when the requested files don't exist instead of throwing an exception in HDFS table engine"}, {"azure_ignore_file_doesnt_exist", false, false, "Allow to return 0 rows when the requested files don't exist instead of throwing an exception in AzureBlobStorage table engine"}, {"s3_ignore_file_doesnt_exist", false, false, "Allow to return 0 rows when the requested files don't exist instead of throwing an exception in S3 table engine"}, + {"input_format_try_infer_variants", 0, 0, "Try to infer Variant type in text formats when there is more than one possible type for column/array elements"}, }}, {"24.5", {{"allow_deprecated_functions", true, false, "Allow usage of deprecated functions"}, {"allow_experimental_join_condition", false, false, "Support join with inequal conditions which involve columns from both left and right table. e.g. t1.y < t2.y."}, @@ -103,8 +104,6 @@ static std::map sett {"prefer_external_sort_block_bytes", 0, DEFAULT_BLOCK_SIZE * 256, "Prefer maximum block bytes for external sort, reduce the memory usage during merging."}, {"input_format_parquet_use_native_reader", false, false, "When reading Parquet files, to use native reader instead of arrow reader."}, {"input_format_force_null_for_omitted_fields", false, false, "Disable type-defaults for omitted fields when needed"}, - {"output_format_pretty_preserve_border_for_multiline_string", 1, 1, "Applies better rendering for multiline strings."}, - {"input_format_try_infer_variants", 0, 0, "Try to infer Variant type in text formats when there is more than one possible type for column/array elements"}, {"cast_string_to_dynamic_use_inference", false, false, "Add setting to allow converting String to Dynamic through parsing"}, {"allow_experimental_dynamic_type", false, false, "Add new experimental Dynamic type"}, {"azure_max_blocks_in_multipart_upload", 50000, 50000, "Maximum number of blocks in multipart upload for Azure."}, diff --git a/src/Formats/SchemaInferenceUtils.cpp b/src/Formats/SchemaInferenceUtils.cpp index a8b5d4343f5..b7c71a95b29 100644 --- a/src/Formats/SchemaInferenceUtils.cpp +++ b/src/Formats/SchemaInferenceUtils.cpp @@ -239,16 +239,6 @@ namespace return true; } - bool checkIfTypesContainVariant(const DataTypes & types) - { - for (size_t i = 0; i < types.size(); ++i) - { - if (isVariant(types[i])) - return true; - } - return false; - } - void updateTypeIndexes(DataTypes & data_types, TypeIndexesSet & type_indexes) { type_indexes.clear(); @@ -321,49 +311,28 @@ namespace /// if setting 'try_infer_variant' is true then we convert to type variant. void transformVariant(DataTypes & data_types, TypeIndexesSet & type_indexes) { - auto typesAreEqual = checkIfTypesAreEqual(data_types); - auto typesContainVariant = checkIfTypesContainVariant(data_types); - if (typesAreEqual) + if (checkIfTypesAreEqual(data_types)) return; - DataTypes new_data_types; - TypeIndexesSet new_type_indexes; - std::shared_ptr variant_type; - - /// extract the nested types of variant and make a new variant with the nested types and the other type. - /// eg. Type 1: variant, Type 2: Date -> variant. - if (typesContainVariant) + DataTypes variant_types; + for (const auto & type : data_types) { - DataTypes extracted_types; - for (size_t i=0; i(type.get())) { - if (isVariant(data_types[i])) - { - if (const auto * variant = typeid_cast(data_types[i].get())) - extracted_types = variant->getVariants(); - } - else - extracted_types.push_back(data_types[i]); + const auto & current_variants = variant_type->getVariants(); + variant_types.insert(variant_types.end(), current_variants.begin(), current_variants.end()); + } + else + { + variant_types.push_back(type); } - variant_type = std::make_shared(extracted_types); - } - else - { - variant_type = std::make_shared(data_types); } - size_t i = 0; - while (i != data_types.size()) - { - new_data_types.push_back(variant_type); - new_type_indexes.insert(TypeIndex::Variant); - i++; - } + auto variant_type = std::make_shared(variant_types); - data_types.clear(); - type_indexes.clear(); - data_types = new_data_types; - type_indexes = new_type_indexes; + for (auto & type : data_types) + type = variant_type; + type_indexes = {TypeIndex::Variant}; } /// If we have only Date and DateTime types, convert Date to DateTime, @@ -703,11 +672,12 @@ namespace if (settings.try_infer_dates || settings.try_infer_datetimes) transformDatesAndDateTimes(data_types, type_indexes); - if (settings.try_infer_variant) - transformVariant(data_types, type_indexes); - if constexpr (!is_json) + { + if (settings.try_infer_variant) + transformVariant(data_types, type_indexes); return; + } /// Check settings specific for JSON formats. @@ -740,11 +710,12 @@ namespace /// If there is at least one non Nothing type, change all Nothing types to it. transformNothingComplexTypes(data_types, type_indexes); - if (settings.try_infer_variant) - transformVariant(data_types, type_indexes); - if constexpr (!is_json) + { + if (settings.try_infer_variant) + transformVariant(data_types, type_indexes); return; + } /// Convert JSON tuples with same nested types to arrays. transformTuplesWithEqualNestedTypesToArrays(data_types, type_indexes); diff --git a/tests/queries/0_stateless/03150_infer_type_variant.reference b/tests/queries/0_stateless/03150_infer_type_variant.reference index a5f56cb3618..a43fa1e1227 100644 --- a/tests/queries/0_stateless/03150_infer_type_variant.reference +++ b/tests/queries/0_stateless/03150_infer_type_variant.reference @@ -1,16 +1,16 @@ - ┏━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ - ┃ arr ┃ toTypeName(arr) ┃ - ┡━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ -1. │ [1,'Hello',(32)] │ Array(Variant(Int64, String, Tuple( + ┏━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ + ┃ arr ┃ toTypeName(arr) ┃ + ┡━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ +1. │ ['1','Hello',(32)] │ Array(Variant(String, Tuple( a Nullable(Int64)))) │ - └──────────────────┴─────────────────────────────────────────────────────────────┘ - ┏━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┓ - ┃ x ┃ toTypeName(x) ┃ - ┡━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━┩ -1. │ 42 │ Variant(Int64, String) │ - ├───────┼────────────────────────┤ -2. │ Hello │ Variant(Int64, String) │ - └───────┴────────────────────────┘ + └────────────────────┴──────────────────────────────────────────────────────┘ + ┏━━━━━━━┳━━━━━━━━━━━━━━━━━━┓ + ┃ x ┃ toTypeName(x) ┃ + ┡━━━━━━━╇━━━━━━━━━━━━━━━━━━┩ +1. │ 42 │ Nullable(String) │ + ├───────┼──────────────────┤ +2. │ Hello │ Nullable(String) │ + └───────┴──────────────────┘ ┏━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ ┃ x ┃ toTypeName(x) ┃ ┡━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩