Address comments

2024-12-01 03:52:15 +00:00 · 2023-11-20 15:53:28 +00:00 · 2023-11-20 15:53:28 +00:00 · 081fa9f3de
commit 081fa9f3de
parent f537bad469
4 changed files with 22 additions and 8 deletions
--- a/src/Core/SettingsChangesHistory.h
+++ b/src/Core/SettingsChangesHistory.h
@ -80,7 +80,7 @@ namespace SettingsChangesHistory
 /// It's used to implement `compatibility` setting (see https://github.com/ClickHouse/ClickHouse/issues/35972)
 static std::map<ClickHouseVersion, SettingsChangesHistory::SettingsChanges> settings_changes_history =
 {
-    {"23.10", {{"input_format_parquet_allow_missing_columns", false, true, "Allow missing columns in Parquet files by default"},
+    {"23.11", {{"input_format_parquet_allow_missing_columns", false, true, "Allow missing columns in Parquet files by default"},
              {"input_format_orc_allow_missing_columns", false, true, "Allow missing columns in ORC files by default"},
              {"input_format_arrow_allow_missing_columns", false, true, "Allow missing columns in Arrow files by default"}}},
    {"23.9", {{"optimize_group_by_constant_keys", false, true, "Optimize group by constant keys by default"},
--- a/src/Formats/ReadSchemaUtils.cpp
+++ b/src/Formats/ReadSchemaUtils.cpp
@ -55,7 +55,14 @@ try
    NamesAndTypesList names_and_types;
    SchemaInferenceMode mode = context->getSettingsRef().schema_inference_mode;
    if (mode == SchemaInferenceMode::UNION && !FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(format_name, context, format_settings))
-        throw Exception(ErrorCodes::BAD_ARGUMENTS, "UNION schema inference mode is not supported for format {}, because it doesn't support reading subset of columns", format_name);
+    {
+        String additional_message;
+        /// Better exception message for WithNames(AndTypes) formats.
+        if (format_name.ends_with("WithNames") || format_name.ends_with("WithNamesAndTypes"))
+            additional_message = " (formats -WithNames(AndTypes) support reading subset of columns only when setting input_format_with_names_use_header is enabled)";
+
+        throw Exception(ErrorCodes::BAD_ARGUMENTS, "UNION schema inference mode is not supported for format {}, because it doesn't support reading subset of columns{}", format_name, additional_message);
+    }

    if (FormatFactory::instance().checkIfFormatHasExternalSchemaReader(format_name))
    {
--- a/src/Formats/ReadSchemaUtils.h
+++ b/src/Formats/ReadSchemaUtils.h
@ -57,11 +57,18 @@ private:
 /// use it and won't create a read buffer.
 /// For formats that have a schema reader from the data,
 /// read buffer will be created by the provided iterator and
-/// the schema will be extracted from the data. If schema reader
-/// couldn't determine the schema we will try the next read buffer
-/// from the provided iterator if it makes sense. If the format doesn't
-/// have any schema reader or we couldn't determine the schema,
-/// an exception will be thrown.
+/// the schema will be extracted from the data. If the format doesn't
+/// have any schema reader, an exception will be thrown.
+/// Reading schema can be performed in 2 modes depending on setting schema_inference_mode:
+/// 1) Default mode. In this mode ClickHouse assumes that all files have the same schema
+/// and tries to infer the schema by reading files one by one until it succeeds.
+/// If schema reader couldn't determine the schema for some file, ClickHouse will try the next
+/// file (next read buffer from the provided iterator) if it makes sense. If ClickHouse couldn't determine
+/// the resulting schema, an exception will be thrown.
+/// 2) Union mode. In this mode ClickHouse assumes that files can have different schemas,
+/// so it infers the schemas of all files and then unions them into a common schema. In this mode
+/// all read buffers from the provided iterator will be used. If ClickHouse couldn't determine
+/// the schema for some file, an exception will be thrown.
 ColumnsDescription readSchemaFromFormat(
    const String & format_name,
    const std::optional<FormatSettings> & format_settings,
--- a/src/Formats/SchemaInferenceUtils.cpp
+++ b/src/Formats/SchemaInferenceUtils.cpp
@ -577,7 +577,7 @@ namespace
        element_types.reserve(names_to_types.size());
        for (const auto & name : element_names)
        {
-            auto types = names_to_types[name];
+            auto & types = names_to_types[name];
            transformInferredTypesIfNeededImpl<true>(types, settings, json_info);
            /// If some element have different types in different tuples, we can't do anything
            if (!checkIfTypesAreEqual(types))