Review changes

2024-09-19 16:20:50 +00:00 · 2024-06-11 11:11:06 +02:00 · 2024-06-11 11:11:06 +02:00 · 9cf11a210f
commit 9cf11a210f
parent 418fc7f443
3 changed files with 35 additions and 65 deletions
--- a/src/Core/SettingsChangesHistory.h
+++ b/src/Core/SettingsChangesHistory.h
@@ -92,6 +92,7 @@ static std::map<ClickHouseVersion, SettingsChangesHistory::SettingsChanges> sett
              {"hdfs_ignore_file_doesnt_exist", false, false, "Allow to return 0 rows when the requested files don't exist instead of throwing an exception in HDFS table engine"},
              {"azure_ignore_file_doesnt_exist", false, false, "Allow to return 0 rows when the requested files don't exist instead of throwing an exception in AzureBlobStorage table engine"},
              {"s3_ignore_file_doesnt_exist", false, false, "Allow to return 0 rows when the requested files don't exist instead of throwing an exception in S3 table engine"},
+              {"input_format_try_infer_variants", 0, 0, "Try to infer Variant type in text formats when there is more than one possible type for column/array elements"},
              }},
    {"24.5", {{"allow_deprecated_functions", true, false, "Allow usage of deprecated functions"},
              {"allow_experimental_join_condition", false, false, "Support join with inequal conditions which involve columns from both left and right table. e.g. t1.y < t2.y."},
@@ -103,8 +104,6 @@ static std::map<ClickHouseVersion, SettingsChangesHistory::SettingsChanges> sett
              {"prefer_external_sort_block_bytes", 0, DEFAULT_BLOCK_SIZE * 256, "Prefer maximum block bytes for external sort, reduce the memory usage during merging."},
              {"input_format_parquet_use_native_reader", false, false, "When reading Parquet files, to use native reader instead of arrow reader."},
              {"input_format_force_null_for_omitted_fields", false, false, "Disable type-defaults for omitted fields when needed"},
-              {"output_format_pretty_preserve_border_for_multiline_string", 1, 1, "Applies better rendering for multiline strings."},
-              {"input_format_try_infer_variants", 0, 0, "Try to infer Variant type in text formats when there is more than one possible type for column/array elements"},
              {"cast_string_to_dynamic_use_inference", false, false, "Add setting to allow converting String to Dynamic through parsing"},
              {"allow_experimental_dynamic_type", false, false, "Add new experimental Dynamic type"},
              {"azure_max_blocks_in_multipart_upload", 50000, 50000, "Maximum number of blocks in multipart upload for Azure."},
--- a/src/Formats/SchemaInferenceUtils.cpp
+++ b/src/Formats/SchemaInferenceUtils.cpp
@@ -239,16 +239,6 @@ namespace
        return true;
    }

-    bool checkIfTypesContainVariant(const DataTypes & types)
-    {
-        for (size_t i = 0; i < types.size(); ++i)
-        {
-            if (isVariant(types[i]))
-                return true;
-        }
-        return false;
-    }
-
    void updateTypeIndexes(DataTypes & data_types, TypeIndexesSet & type_indexes)
    {
        type_indexes.clear();
@@ -321,49 +311,28 @@ namespace
    /// if setting 'try_infer_variant' is true then we convert to type variant.
    void transformVariant(DataTypes & data_types, TypeIndexesSet & type_indexes)
    {
-        auto typesAreEqual = checkIfTypesAreEqual(data_types);
-        auto typesContainVariant = checkIfTypesContainVariant(data_types);
-        if (typesAreEqual)
+        if (checkIfTypesAreEqual(data_types))
            return;

-        DataTypes new_data_types;
-        TypeIndexesSet new_type_indexes;
-        std::shared_ptr<DataTypeVariant> variant_type;
-
-        /// extract the nested types of variant and make a new variant with the nested types and the other type.
-        /// eg. Type 1: variant<String, Array>, Type 2: Date -> variant<String, Array, Date>.
-        if (typesContainVariant)
+        DataTypes variant_types;
+        for (const auto & type : data_types)
        {
-            DataTypes extracted_types;
-            for (size_t i=0; i<data_types.size(); i++)
+            if (const auto * variant_type = typeid_cast<const DataTypeVariant *>(type.get()))
            {
-                if (isVariant(data_types[i]))
-                {
-                    if (const auto * variant = typeid_cast<const DataTypeVariant *>(data_types[i].get()))
-                        extracted_types = variant->getVariants();
-                }
-                else
-                    extracted_types.push_back(data_types[i]);
+                const auto & current_variants = variant_type->getVariants();
+                variant_types.insert(variant_types.end(), current_variants.begin(), current_variants.end());
+            }
+            else
+            {
+                variant_types.push_back(type);
            }
-            variant_type = std::make_shared<DataTypeVariant>(extracted_types);
-        }
-        else
-        {
-            variant_type = std::make_shared<DataTypeVariant>(data_types);
        }

-        size_t i = 0;
-        while (i != data_types.size())
-        {
-            new_data_types.push_back(variant_type);
-            new_type_indexes.insert(TypeIndex::Variant);
-            i++;
-        }
+        auto variant_type = std::make_shared<DataTypeVariant>(variant_types);

-        data_types.clear();
-        type_indexes.clear();
-        data_types = new_data_types;
-        type_indexes = new_type_indexes;
+        for (auto & type : data_types)
+            type = variant_type;
+        type_indexes = {TypeIndex::Variant};
    }

    /// If we have only Date and DateTime types, convert Date to DateTime,
@@ -703,11 +672,12 @@ namespace
            if (settings.try_infer_dates || settings.try_infer_datetimes)
                transformDatesAndDateTimes(data_types, type_indexes);

-            if (settings.try_infer_variant)
-                transformVariant(data_types, type_indexes);
-
            if constexpr (!is_json)
+            {
+                if (settings.try_infer_variant)
+                    transformVariant(data_types, type_indexes);
                return;
+            }

            /// Check settings specific for JSON formats.

@@ -740,11 +710,12 @@ namespace
            /// If there is at least one non Nothing type, change all Nothing types to it.
            transformNothingComplexTypes(data_types, type_indexes);

-            if (settings.try_infer_variant)
-                transformVariant(data_types, type_indexes);
-
            if constexpr (!is_json)
+            {
+                if (settings.try_infer_variant)
+                    transformVariant(data_types, type_indexes);
                return;
+            }

            /// Convert JSON tuples with same nested types to arrays.
            transformTuplesWithEqualNestedTypesToArrays(data_types, type_indexes);
--- a/tests/queries/0_stateless/03150_infer_type_variant.reference
+++ b/tests/queries/0_stateless/03150_infer_type_variant.reference
@@ -1,16 +1,16 @@
-   ┏━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
-   ┃ arr              ┃ toTypeName(arr)                                             ┃
-   ┡━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
-1. │ [1,'Hello',(32)] │ Array(Variant(Int64, String, Tuple(
+   ┏━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
+   ┃ arr                ┃ toTypeName(arr)                                      ┃
+   ┡━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
+1. │ ['1','Hello',(32)] │ Array(Variant(String, Tuple(
    a Nullable(Int64)))) │
-   └──────────────────┴─────────────────────────────────────────────────────────────┘
-   ┏━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┓
-   ┃ x     ┃ toTypeName(x)          ┃
-   ┡━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━┩
-1. │ 42    │ Variant(Int64, String) │
-   ├───────┼────────────────────────┤
-2. │ Hello │ Variant(Int64, String) │
-   └───────┴────────────────────────┘
+   └────────────────────┴──────────────────────────────────────────────────────┘
+   ┏━━━━━━━┳━━━━━━━━━━━━━━━━━━┓
+   ┃ x     ┃ toTypeName(x)    ┃
+   ┡━━━━━━━╇━━━━━━━━━━━━━━━━━━┩
+1. │ 42    │ Nullable(String) │
+   ├───────┼──────────────────┤
+2. │ Hello │ Nullable(String) │
+   └───────┴──────────────────┘
   ┏━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
   ┃ x       ┃ toTypeName(x)                                                 ┃
   ┡━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩