Make schema inference cache better, respect format settings that can change the schema

2024-09-20 08:40:50 +00:00 · 2022-08-19 16:39:13 +00:00 · 2022-08-19 16:39:13 +00:00 · 612ffaffde
commit 612ffaffde
parent b67cb9e378
28 changed files with 309 additions and 81 deletions
--- a/src/DataTypes/Serializations/SerializationArray.cpp
+++ b/src/DataTypes/Serializations/SerializationArray.cpp
@ -535,7 +535,7 @@ void SerializationArray::deserializeTextCSV(IColumn & column, ReadBuffer & istr,
    readCSV(s, istr, settings.csv);
    ReadBufferFromString rb(s);

-    if (settings.csv.input_format_arrays_as_nested_csv)
+    if (settings.csv.arrays_as_nested_csv)
    {
        deserializeTextImpl(column, rb,
            [&](IColumn & nested_column)
--- a/src/DataTypes/Serializations/SerializationEnum.cpp
+++ b/src/DataTypes/Serializations/SerializationEnum.cpp
@ -24,7 +24,7 @@ void SerializationEnum<Type>::serializeTextEscaped(const IColumn & column, size_
 template <typename Type>
 void SerializationEnum<Type>::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
 {
-    if (settings.tsv.input_format_enum_as_number)
+    if (settings.tsv.enum_as_number)
        assert_cast<ColumnType &>(column).getData().push_back(readValue(istr));
    else
    {
@ -52,7 +52,7 @@ void SerializationEnum<Type>::deserializeTextQuoted(IColumn & column, ReadBuffer
 template <typename Type>
 void SerializationEnum<Type>::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
 {
-    if (settings.tsv.input_format_enum_as_number)
+    if (settings.tsv.enum_as_number)
    {
        assert_cast<ColumnType &>(column).getData().push_back(readValue(istr));
        if (!istr.eof())
@ -100,7 +100,7 @@ void SerializationEnum<Type>::serializeTextCSV(const IColumn & column, size_t ro
 template <typename Type>
 void SerializationEnum<Type>::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
 {
-    if (settings.csv.input_format_enum_as_number)
+    if (settings.csv.enum_as_number)
        assert_cast<ColumnType &>(column).getData().push_back(readValue(istr));
    else
    {
--- a/src/Formats/EscapingRuleUtils.cpp
+++ b/src/Formats/EscapingRuleUtils.cpp
@ -697,7 +697,7 @@ DataTypePtr determineDataTypeByEscapingRule(const String & field, const FormatSe
            return JSONUtils::getDataTypeFromField(field, format_settings);
        case FormatSettings::EscapingRule::CSV:
        {
-            if (!format_settings.csv.input_format_use_best_effort_in_schema_inference)
+            if (!format_settings.csv.use_best_effort_in_schema_inference)
                return makeNullable(std::make_shared<DataTypeString>());

            if (field.empty() || field == format_settings.csv.null_representation)
@ -745,7 +745,7 @@ DataTypePtr determineDataTypeByEscapingRule(const String & field, const FormatSe
        case FormatSettings::EscapingRule::Raw: [[fallthrough]];
        case FormatSettings::EscapingRule::Escaped:
        {
-            if (!format_settings.tsv.input_format_use_best_effort_in_schema_inference)
+            if (!format_settings.tsv.use_best_effort_in_schema_inference)
                return makeNullable(std::make_shared<DataTypeString>());

            if (field.empty() || field == format_settings.tsv.null_representation)
@ -799,4 +799,48 @@ DataTypes getDefaultDataTypeForEscapingRules(const std::vector<FormatSettings::E
    return data_types;
 }

+/// Lists, as "name=value" pairs, the format settings that can change the schema inferred under the given escaping rule (used as additional info for the schema cache).
+String getAdditionalFormatInfoByEscapingRule(const FormatSettings & settings, FormatSettings::EscapingRule escaping_rule)
+{
+    /// First, settings that are common for all text formats:
+    String result = fmt::format(
+        "schema_inference_hints={}, try_infer_integers={}, try_infer_dates={}, try_infer_datetimes={}",
+        settings.schema_inference_hints,
+        settings.try_infer_integers,
+        settings.try_infer_dates,
+        settings.try_infer_datetimes);
+
+    /// Second, format-specific settings (rules not listed below contribute only the common part):
+    switch (escaping_rule)
+    {
+        case FormatSettings::EscapingRule::Escaped:
+        case FormatSettings::EscapingRule::Raw:
+            result += fmt::format(
+                ", use_best_effort_in_schema_inference={}, bool_true_representation={}, bool_false_representation={}, null_representation={}",
+                settings.tsv.use_best_effort_in_schema_inference,
+                settings.bool_true_representation,
+                settings.bool_false_representation,
+                settings.tsv.null_representation);
+            break;
+        case FormatSettings::EscapingRule::CSV:
+            result += fmt::format(
+                ", use_best_effort_in_schema_inference={}, bool_true_representation={}, bool_false_representation={},"
+                " null_representation={}, delimiter={}, tuple_delimiter={}",
+                settings.csv.use_best_effort_in_schema_inference,
+                settings.bool_true_representation,
+                settings.bool_false_representation,
+                settings.csv.null_representation,
+                settings.csv.delimiter,
+                settings.csv.tuple_delimiter);
+            break;
+        case FormatSettings::EscapingRule::JSON:
+            result += fmt::format(", try_infer_numbers_from_strings={}, read_bools_as_numbers={}", settings.json.try_infer_numbers_from_strings, settings.json.read_bools_as_numbers);
+            break;
+        default:
+            break;
+    }
+
+    return result;
+}
+
 }
--- a/src/Formats/EscapingRuleUtils.h
+++ b/src/Formats/EscapingRuleUtils.h
@ -77,4 +77,6 @@ void transformInferredTypesIfNeeded(DataTypePtr & first, DataTypePtr & second, c
 void transformInferredJSONTypesIfNeeded(DataTypes & types, const FormatSettings & settings, const std::unordered_set<const IDataType *> * numbers_parsed_from_json_strings = nullptr);
 void transformInferredJSONTypesIfNeeded(DataTypePtr & first, DataTypePtr & second, const FormatSettings & settings);

+String getAdditionalFormatInfoByEscapingRule(const FormatSettings & settings,FormatSettings::EscapingRule escaping_rule);
+
 }
--- a/src/Formats/FormatFactory.cpp
+++ b/src/Formats/FormatFactory.cpp
@ -63,10 +63,10 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
    format_settings.csv.delimiter = settings.format_csv_delimiter;
    format_settings.csv.tuple_delimiter = settings.format_csv_delimiter;
    format_settings.csv.empty_as_default = settings.input_format_csv_empty_as_default;
-    format_settings.csv.input_format_enum_as_number = settings.input_format_csv_enum_as_number;
+    format_settings.csv.enum_as_number = settings.input_format_csv_enum_as_number;
    format_settings.csv.null_representation = settings.format_csv_null_representation;
-    format_settings.csv.input_format_arrays_as_nested_csv = settings.input_format_csv_arrays_as_nested_csv;
-    format_settings.csv.input_format_use_best_effort_in_schema_inference = settings.input_format_csv_use_best_effort_in_schema_inference;
+    format_settings.csv.arrays_as_nested_csv = settings.input_format_csv_arrays_as_nested_csv;
+    format_settings.csv.use_best_effort_in_schema_inference = settings.input_format_csv_use_best_effort_in_schema_inference;
    format_settings.csv.skip_first_lines = settings.input_format_csv_skip_first_lines;
    format_settings.hive_text.fields_delimiter = settings.input_format_hive_text_fields_delimiter;
    format_settings.hive_text.collection_items_delimiter = settings.input_format_hive_text_collection_items_delimiter;
@ -124,9 +124,9 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
    format_settings.template_settings.row_format = settings.format_template_row;
    format_settings.tsv.crlf_end_of_line = settings.output_format_tsv_crlf_end_of_line;
    format_settings.tsv.empty_as_default = settings.input_format_tsv_empty_as_default;
-    format_settings.tsv.input_format_enum_as_number = settings.input_format_tsv_enum_as_number;
+    format_settings.tsv.enum_as_number = settings.input_format_tsv_enum_as_number;
    format_settings.tsv.null_representation = settings.format_tsv_null_representation;
-    format_settings.tsv.input_format_use_best_effort_in_schema_inference = settings.input_format_tsv_use_best_effort_in_schema_inference;
+    format_settings.tsv.use_best_effort_in_schema_inference = settings.input_format_tsv_use_best_effort_in_schema_inference;
    format_settings.tsv.skip_first_lines = settings.input_format_tsv_skip_first_lines;
    format_settings.values.accurate_types_of_literals = settings.input_format_values_accurate_types_of_literals;
    format_settings.values.deduce_templates_of_expressions = settings.input_format_values_deduce_templates_of_expressions;
--- a/src/Formats/FormatSettings.h
+++ b/src/Formats/FormatSettings.h
@ -111,11 +111,11 @@ struct FormatSettings
        bool allow_double_quotes = true;
        bool empty_as_default = false;
        bool crlf_end_of_line = false;
-        bool input_format_enum_as_number = false;
-        bool input_format_arrays_as_nested_csv = false;
+        bool enum_as_number = false;
+        bool arrays_as_nested_csv = false;
        String null_representation = "\\N";
        char tuple_delimiter = ',';
-        bool input_format_use_best_effort_in_schema_inference = true;
+        bool use_best_effort_in_schema_inference = true;
        UInt64 skip_first_lines = 0;
    } csv;

@ -227,8 +227,8 @@ struct FormatSettings
        bool empty_as_default = false;
        bool crlf_end_of_line = false;
        String null_representation = "\\N";
-        bool input_format_enum_as_number = false;
-        bool input_format_use_best_effort_in_schema_inference = true;
+        bool enum_as_number = false;
+        bool use_best_effort_in_schema_inference = true;
        UInt64 skip_first_lines = 0;
    } tsv;

--- a/src/Processors/Formats/ISchemaReader.cpp
+++ b/src/Processors/Formats/ISchemaReader.cpp
@ -89,15 +89,13 @@ void IIRowSchemaReader::setContext(ContextPtr & context)
 }

 IRowSchemaReader::IRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_)
-    : IIRowSchemaReader(in_, format_settings_)
+    : IIRowSchemaReader(in_, format_settings_), column_names(splitColumnNames(format_settings.column_names_for_schema_inference))
 {
-    initColumnNames(format_settings.column_names_for_schema_inference);
 }

 IRowSchemaReader::IRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_, DataTypePtr default_type_)
-    : IIRowSchemaReader(in_, format_settings_, default_type_)
+    : IIRowSchemaReader(in_, format_settings_, default_type_), column_names(splitColumnNames(format_settings.column_names_for_schema_inference))
 {
-    initColumnNames(format_settings.column_names_for_schema_inference);
 }

 IRowSchemaReader::IRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_, const DataTypes & default_types_)
@ -171,11 +169,12 @@ NamesAndTypesList IRowSchemaReader::readSchema()
    return result;
 }

-void IRowSchemaReader::initColumnNames(const String & column_names_str)
+Strings splitColumnNames(const String & column_names_str)
 {
    if (column_names_str.empty())
-        return;
+        return {};

+    Strings column_names;
    /// column_names_for_schema_inference is a string in format 'column1,column2,column3,...'
    boost::split(column_names, column_names_str, boost::is_any_of(","));
    for (auto & column_name : column_names)
@ -184,6 +183,7 @@ void IRowSchemaReader::initColumnNames(const String & column_names_str)
        if (!col_name_trimmed.empty())
            column_name = col_name_trimmed;
    }
+    return column_names;
 }

 DataTypePtr IRowSchemaReader::getDefaultType(size_t column) const
--- a/src/Processors/Formats/ISchemaReader.h
+++ b/src/Processors/Formats/ISchemaReader.h
@ -136,4 +136,6 @@ void chooseResultColumnType(
 void checkResultColumnTypeAndAppend(
    NamesAndTypesList & result, DataTypePtr & type, const String & name, const DataTypePtr & default_type, size_t rows_read);

+Strings splitColumnNames(const String & column_names_str);
+
 }
--- a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp
+++ b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp
@ -401,6 +401,13 @@ void registerCSVSchemaReader(FormatFactory & factory)
        {
            return std::make_shared<CSVSchemaReader>(buf, with_names, with_types, settings);
        });
+        factory.registerAdditionalInfoForSchemaCacheGetter(format_name, [with_names](const FormatSettings & settings)
+        {
+            String result = getAdditionalFormatInfoByEscapingRule(settings, FormatSettings::EscapingRule::CSV);
+            if (!with_names)
+                result += fmt::format(", column_names_for_schema_inference={}", settings.column_names_for_schema_inference);
+            return result;
+        });
    };

    registerWithNamesAndTypes("CSV", register_func);
--- a/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp
+++ b/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp
@ -319,7 +319,7 @@ void registerInputFormatCapnProto(FormatFactory & factory)
    factory.markFormatSupportsSubsetOfColumns("CapnProto");
    factory.registerFileExtension("capnp", "CapnProto");
    factory.registerAdditionalInfoForSchemaCacheGetter(
-        "CapnProto", [](const FormatSettings & settings) { return "Format schema: " + settings.schema.format_schema; });
+        "CapnProto", [](const FormatSettings & settings) { return fmt::format("format_schema={}", settings.schema.format_schema); });
 }

 void registerCapnProtoSchemaReader(FormatFactory & factory)
--- a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp
+++ b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp
@ -353,6 +353,19 @@ void registerCustomSeparatedSchemaReader(FormatFactory & factory)
            {
                return std::make_shared<CustomSeparatedSchemaReader>(buf, with_names, with_types, ignore_spaces, settings);
            });
+            factory.registerAdditionalInfoForSchemaCacheGetter(format_name, [](const FormatSettings & settings)
+            {
+                String result = getAdditionalFormatInfoByEscapingRule(settings, settings.custom.escaping_rule);
+                return result + fmt::format(
+                        ", result_before_delimiter={}, row_before_delimiter={}, field_delimiter={},"
+                        " row_after_delimiter={}, row_between_delimiter={}, result_after_delimiter={}",
+                        settings.custom.result_before_delimiter,
+                        settings.custom.row_before_delimiter,
+                        settings.custom.field_delimiter,
+                        settings.custom.row_after_delimiter,
+                        settings.custom.row_between_delimiter,
+                        settings.custom.result_after_delimiter);
+            });
        };

        registerWithNamesAndTypes(ignore_spaces ? "CustomSeparatedIgnoreSpaces" : "CustomSeparated", register_func);
--- a/src/Processors/Formats/Impl/JSONColumnsBlockInputFormat.cpp
+++ b/src/Processors/Formats/Impl/JSONColumnsBlockInputFormat.cpp
@ -1,6 +1,7 @@
 #include <Processors/Formats/Impl/JSONColumnsBlockInputFormat.h>
 #include <IO/ReadHelpers.h>
 #include <Formats/FormatFactory.h>
+#include <Formats/EscapingRuleUtils.h>

 namespace DB
 {
@ -66,6 +67,10 @@ void registerJSONColumnsSchemaReader(FormatFactory & factory)
            return std::make_shared<JSONColumnsSchemaReaderBase>(buf, settings, std::make_unique<JSONColumnsReader>(buf));
        }
    );
+    factory.registerAdditionalInfoForSchemaCacheGetter("JSONColumns", [](const FormatSettings & settings)
+    {
+        return getAdditionalFormatInfoByEscapingRule(settings, FormatSettings::EscapingRule::JSON);
+    });
 }

 }
--- a/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.cpp
+++ b/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.cpp
@ -1,4 +1,5 @@
 #include <Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.h>
+#include <Processors/Formats/ISchemaReader.h>
 #include <Formats/JSONUtils.h>
 #include <Formats/EscapingRuleUtils.h>
 #include <IO/ReadHelpers.h>
@ -178,7 +179,10 @@ Chunk JSONColumnsBlockInputFormatBase::generate()

 JSONColumnsSchemaReaderBase::JSONColumnsSchemaReaderBase(
    ReadBuffer & in_, const FormatSettings & format_settings_, std::unique_ptr<JSONColumnsReaderBase> reader_)
-    : ISchemaReader(in_), format_settings(format_settings_), reader(std::move(reader_))
+    : ISchemaReader(in_)
+    , format_settings(format_settings_)
+    , reader(std::move(reader_))
+    , column_names_from_settings(splitColumnNames(format_settings_.column_names_for_schema_inference))
 {
 }

@ -214,8 +218,15 @@ NamesAndTypesList JSONColumnsSchemaReaderBase::readSchema()
        do
        {
            auto column_name_opt = reader->readColumnStart();
-            /// If format doesn't have named for columns, use default names 'c1', 'c2', ...
-            String column_name = column_name_opt.has_value() ? *column_name_opt : "c" + std::to_string(iteration + 1);
+            /// If format doesn't have names for columns, use names from setting column_names_for_schema_inference or default names 'c1', 'c2', ...
+            String column_name;
+            if (column_name_opt.has_value())
+                column_name = *column_name_opt;
+            else if (iteration < column_names_from_settings.size())
+                column_name = column_names_from_settings[iteration];
+            else
+                column_name = "c" + std::to_string(iteration + 1);
+
            /// Keep order of column names as it is in input data.
            if (!names_to_types.contains(column_name))
                names_order.push_back(column_name);
--- a/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.h
+++ b/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.h
@ -87,6 +87,7 @@ private:

    const FormatSettings format_settings;
    std::unique_ptr<JSONColumnsReaderBase> reader;
+    Names column_names_from_settings;
 };

 }
--- a/src/Processors/Formats/Impl/JSONCompactColumnsBlockInputFormat.cpp
+++ b/src/Processors/Formats/Impl/JSONCompactColumnsBlockInputFormat.cpp
@ -1,6 +1,7 @@
 #include <Processors/Formats/Impl/JSONCompactColumnsBlockInputFormat.h>
 #include <IO/ReadHelpers.h>
 #include <Formats/FormatFactory.h>
+#include <Formats/EscapingRuleUtils.h>

 namespace DB
 {
@ -60,6 +61,11 @@ void registerJSONCompactColumnsSchemaReader(FormatFactory & factory)
            return std::make_shared<JSONColumnsSchemaReaderBase>(buf, settings, std::make_unique<JSONCompactColumnsReader>(buf));
        }
    );
+    factory.registerAdditionalInfoForSchemaCacheGetter("JSONCompactColumns", [](const FormatSettings & settings)
+    {
+        auto result = getAdditionalFormatInfoByEscapingRule(settings, FormatSettings::EscapingRule::JSON);
+        return result + fmt::format(", column_names_for_schema_inference={}", settings.column_names_for_schema_inference);
+    });
 }

 }
--- a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp
+++ b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp
@ -245,6 +245,11 @@ void registerJSONCompactEachRowSchemaReader(FormatFactory & factory)
            {
                return std::make_shared<JSONCompactEachRowRowSchemaReader>(buf, with_names, with_types, json_strings, settings);
            });
+            factory.registerAdditionalInfoForSchemaCacheGetter(format_name, [](const FormatSettings & settings)
+            {
+                auto result = getAdditionalFormatInfoByEscapingRule(settings, FormatSettings::EscapingRule::JSON);
+                return result + fmt::format(", column_names_for_schema_inference={}", settings.column_names_for_schema_inference);
+            });
        };
        registerWithNamesAndTypes(json_strings ? "JSONCompactStringsEachRow" : "JSONCompactEachRow", register_func);
    }
--- a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp
+++ b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp
@ -355,44 +355,26 @@ void JSONEachRowSchemaReader::transformTypesIfNeeded(DataTypePtr & type, DataTyp

 void registerInputFormatJSONEachRow(FormatFactory & factory)
 {
-    factory.registerInputFormat("JSONEachRow", [](
-        ReadBuffer & buf,
-        const Block & sample,
-        IRowInputFormat::Params params,
-        const FormatSettings & settings)
+    auto register_format = [&](const String & format_name, bool json_strings)
    {
-        return std::make_shared<JSONEachRowRowInputFormat>(buf, sample, std::move(params), settings, false);
-    });
+        factory.registerInputFormat(format_name, [json_strings](
+            ReadBuffer & buf,
+            const Block & sample,
+            IRowInputFormat::Params params,
+            const FormatSettings & settings)
+        {
+            return std::make_shared<JSONEachRowRowInputFormat>(buf, sample, std::move(params), settings, json_strings);
+        });
+    };

-    factory.registerInputFormat("JSONLines", [](
-        ReadBuffer & buf,
-        const Block & sample,
-        IRowInputFormat::Params params,
-        const FormatSettings & settings)
-    {
-        return std::make_shared<JSONEachRowRowInputFormat>(buf, sample, std::move(params), settings, false);
-    });
-
-    factory.registerInputFormat("NDJSON", [](
-        ReadBuffer & buf,
-        const Block & sample,
-        IRowInputFormat::Params params,
-        const FormatSettings & settings)
-    {
-        return std::make_shared<JSONEachRowRowInputFormat>(buf, sample, std::move(params), settings, false);
-    });
+    register_format("JSONEachRow", false);
+    register_format("JSONLines", false);
+    register_format("NDJSON", false);

    factory.registerFileExtension("ndjson", "JSONEachRow");
    factory.registerFileExtension("jsonl", "JSONEachRow");

-    factory.registerInputFormat("JSONStringsEachRow", [](
-        ReadBuffer & buf,
-        const Block & sample,
-        IRowInputFormat::Params params,
-        const FormatSettings & settings)
-    {
-        return std::make_shared<JSONEachRowRowInputFormat>(buf, sample, std::move(params), settings, true);
-    });
+    register_format("JSONStringsEachRow", true);

    factory.markFormatSupportsSubsetOfColumns("JSONEachRow");
    factory.markFormatSupportsSubsetOfColumns("JSONLines");
@ -418,25 +400,22 @@ void registerNonTrivialPrefixAndSuffixCheckerJSONEachRow(FormatFactory & factory

 void registerJSONEachRowSchemaReader(FormatFactory & factory)
 {
-    factory.registerSchemaReader("JSONEachRow", [](ReadBuffer & buf, const FormatSettings & settings)
+    auto register_schema_reader = [&](const String & format_name, bool json_strings)
    {
-        return std::make_unique<JSONEachRowSchemaReader>(buf, false, settings);
-    });
+        factory.registerSchemaReader(format_name, [json_strings](ReadBuffer & buf, const FormatSettings & settings)
+        {
+            return std::make_unique<JSONEachRowSchemaReader>(buf, json_strings, settings);
+        });
+        factory.registerAdditionalInfoForSchemaCacheGetter(format_name, [](const FormatSettings & settings)
+        {
+            return getAdditionalFormatInfoByEscapingRule(settings, FormatSettings::EscapingRule::JSON);
+        });
+    };

-    factory.registerSchemaReader("JSONStringsEachRow", [](ReadBuffer & buf, const FormatSettings & settings)
-    {
-        return std::make_unique<JSONEachRowSchemaReader>(buf, true, settings);
-    });
-
-    factory.registerSchemaReader("JSONLines", [](ReadBuffer & buf, const FormatSettings & settings)
-    {
-        return std::make_unique<JSONEachRowSchemaReader>(buf, false, settings);
-    });
-
-    factory.registerSchemaReader("NDJSON", [](ReadBuffer & buf, const FormatSettings & settings)
-    {
-        return std::make_unique<JSONEachRowSchemaReader>(buf, false, settings);
-    });
+    register_schema_reader("JSONEachRow", false);
+    register_schema_reader("JSONLines", false);
+    register_schema_reader("NDJSON", false);
+    register_schema_reader("JSONStringsEachRow", true);
 }

 }
--- a/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp
+++ b/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp
@ -539,6 +539,10 @@ void registerMsgPackSchemaReader(FormatFactory & factory)
    {
        return std::make_shared<MsgPackSchemaReader>(buf, settings);
    });
+    factory.registerAdditionalInfoForSchemaCacheGetter("MsgPack", [](const FormatSettings & settings)
+    {
+        return fmt::format("number_of_columns={}", settings.msgpack.number_of_columns);
+    });
 }

 }
--- a/src/Processors/Formats/Impl/MySQLDumpRowInputFormat.cpp
+++ b/src/Processors/Formats/Impl/MySQLDumpRowInputFormat.cpp
@ -452,9 +452,6 @@ void registerInputFormatMySQLDump(FormatFactory & factory)
    {
        return std::make_shared<MySQLDumpRowInputFormat>(buf, header, params, settings);
    });
-
-    factory.registerAdditionalInfoForSchemaCacheGetter(
-        "MySQLDump", [](const FormatSettings & settings) { return "Table name: " + settings.mysql_dump.table_name; });
 }

 void registerMySQLSchemaReader(FormatFactory & factory)
@ -463,6 +460,12 @@ void registerMySQLSchemaReader(FormatFactory & factory)
    {
        return std::make_shared<MySQLDumpSchemaReader>(buf, settings);
    });
+
+    factory.registerAdditionalInfoForSchemaCacheGetter("MySQLDump", [](const FormatSettings & settings)
+    {
+        auto result = getAdditionalFormatInfoByEscapingRule(settings, FormatSettings::EscapingRule::Quoted);
+        return result + fmt::format(", table_name={}", settings.mysql_dump.table_name);
+    });
 }


--- a/src/Processors/Formats/Impl/ProtobufListInputFormat.cpp
+++ b/src/Processors/Formats/Impl/ProtobufListInputFormat.cpp
@ -82,7 +82,7 @@ void registerInputFormatProtobufList(FormatFactory & factory)
            });
    factory.markFormatSupportsSubsetOfColumns("ProtobufList");
    factory.registerAdditionalInfoForSchemaCacheGetter(
-        "ProtobufList", [](const FormatSettings & settings) { return "Format schema: " + settings.schema.format_schema; });
+        "ProtobufList", [](const FormatSettings & settings) { return fmt::format("format_schema={}", settings.schema.format_schema); });
 }

 void registerProtobufListSchemaReader(FormatFactory & factory)
--- a/src/Processors/Formats/Impl/ProtobufRowInputFormat.cpp
+++ b/src/Processors/Formats/Impl/ProtobufRowInputFormat.cpp
@ -104,7 +104,7 @@ void registerProtobufSchemaReader(FormatFactory & factory)

    for (const auto & name : {"Protobuf", "ProtobufSingle"})
        factory.registerAdditionalInfoForSchemaCacheGetter(
-            name, [](const FormatSettings & settings) { return "Format schema: " + settings.schema.format_schema; });
+            name, [](const FormatSettings & settings) { return fmt::format("format_schema={}", settings.schema.format_schema); });
 }

 }
--- a/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp
+++ b/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp
@ -211,6 +211,11 @@ void registerRegexpSchemaReader(FormatFactory & factory)
    {
        return std::make_shared<RegexpSchemaReader>(buf, settings);
    });
+    factory.registerAdditionalInfoForSchemaCacheGetter("Regexp", [](const FormatSettings & settings)
+    {
+        auto result = getAdditionalFormatInfoByEscapingRule(settings, settings.regexp.escaping_rule);
+        return result + fmt::format(", regexp={}", settings.regexp.regexp);
+    });
 }

 }
--- a/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp
+++ b/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp
@ -285,6 +285,10 @@ void registerTSKVSchemaReader(FormatFactory & factory)
    {
        return std::make_shared<TSKVSchemaReader>(buf, settings);
    });
+    factory.registerAdditionalInfoForSchemaCacheGetter("TSKV", [](const FormatSettings & settings)
+    {
+        return getAdditionalFormatInfoByEscapingRule(settings, FormatSettings::EscapingRule::Escaped);
+    });
 }

 }
--- a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp
+++ b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp
@ -302,6 +302,14 @@ void registerTSVSchemaReader(FormatFactory & factory)
            {
                return std::make_shared<TabSeparatedSchemaReader>(buf, with_names, with_types, is_raw, settings);
            });
+            factory.registerAdditionalInfoForSchemaCacheGetter(format_name, [with_names, is_raw](const FormatSettings & settings)
+            {
+                String result = getAdditionalFormatInfoByEscapingRule(
+                    settings, is_raw ? FormatSettings::EscapingRule::Raw : FormatSettings::EscapingRule::Escaped);
+                if (!with_names)
+                    result += fmt::format(", column_names_for_schema_inference={}", settings.column_names_for_schema_inference);
+                return result;
+            });
        };

        registerWithNamesAndTypes(is_raw ? "TabSeparatedRaw" : "TabSeparated", register_func);
--- a/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp
+++ b/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp
@ -566,13 +566,32 @@ void registerTemplateSchemaReader(FormatFactory & factory)
 {
    for (bool ignore_spaces : {false, true})
    {
-        factory.registerSchemaReader(ignore_spaces ? "TemplateIgnoreSpaces" : "Template", [ignore_spaces](ReadBuffer & buf, const FormatSettings & settings)
+        String format_name = ignore_spaces ? "TemplateIgnoreSpaces" : "Template";
+        factory.registerSchemaReader(format_name, [ignore_spaces](ReadBuffer & buf, const FormatSettings & settings)
        {
            size_t index = 0;
            auto idx_getter = [&](const String &) -> std::optional<size_t> { return index++; };
            auto row_format = fillRowFormat(settings, idx_getter, false);
            return std::make_shared<TemplateSchemaReader>(buf, ignore_spaces, fillResultSetFormat(settings), row_format, settings.template_settings.row_between_delimiter, settings);
        });
+        factory.registerAdditionalInfoForSchemaCacheGetter(format_name, [](const FormatSettings & settings)
+        {
+            size_t index = 0;
+            auto idx_getter = [&](const String &) -> std::optional<size_t> { return index++; };
+            auto row_format = fillRowFormat(settings, idx_getter, false);
+            std::unordered_set<FormatSettings::EscapingRule> visited_escaping_rules;
+            String result = fmt::format("row_format={}, resultset_format={}, row_between_delimiter={}",
+                settings.template_settings.row_format,
+                settings.template_settings.resultset_format,
+                settings.template_settings.row_between_delimiter);
+            for (auto escaping_rule : row_format.escaping_rules)
+            {
+                /// Append the settings of each distinct rule once, using the rule found in the row format itself.
+                if (visited_escaping_rules.insert(escaping_rule).second)
+                    result += ", " + getAdditionalFormatInfoByEscapingRule(settings, escaping_rule);
+            }
+            return result;
+        });
    }
 }

--- a/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp
+++ b/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp
@ -634,6 +634,10 @@ void registerValuesSchemaReader(FormatFactory & factory)
    {
        return std::make_shared<ValuesSchemaReader>(buf, settings);
    });
+    factory.registerAdditionalInfoForSchemaCacheGetter("Values", [](const FormatSettings & settings)
+    {
+        return getAdditionalFormatInfoByEscapingRule(settings, FormatSettings::EscapingRule::Quoted);
+    });
 }

 }
--- a/tests/queries/0_stateless/02404_schema_inference_cache_respect_format_settings.reference
+++ b/tests/queries/0_stateless/02404_schema_inference_cache_respect_format_settings.reference
@ -0,0 +1,90 @@
+TSV
+c1	Nullable(Int64)					
+c2	Nullable(Date)					
+c1	Nullable(Float64)					
+c2	Nullable(Date)					
+c1	Nullable(Int64)					
+c2	Nullable(DateTime64(9))					
+c1	UInt8					
+c2	Nullable(Date)					
+4
+TSVWithNames
+number	Nullable(Int64)					
+toDate(number)	Nullable(Date)					
+number	Nullable(Float64)					
+toDate(number)	Nullable(Date)					
+number	Nullable(Int64)					
+toDate(number)	Nullable(DateTime64(9))					
+number	Nullable(Int64)					
+toDate(number)	Nullable(Date)					
+4
+CSV
+c1	Nullable(Int64)					
+c2	Nullable(Date)					
+c1	Nullable(Float64)					
+c2	Nullable(Date)					
+c1	Nullable(Int64)					
+c2	Nullable(DateTime64(9))					
+c1	UInt8					
+c2	Nullable(Date)					
+4
+CSVWithNames
+number	Nullable(Int64)					
+toDate(number)	Nullable(Date)					
+number	Nullable(Float64)					
+toDate(number)	Nullable(Date)					
+number	Nullable(Int64)					
+toDate(number)	Nullable(DateTime64(9))					
+number	Nullable(Int64)					
+toDate(number)	Nullable(Date)					
+4
+TSKV
+number	Nullable(Int64)					
+toDate(number)	Nullable(Date)					
+number	Nullable(Float64)					
+toDate(number)	Nullable(Date)					
+number	Nullable(Int64)					
+toDate(number)	Nullable(DateTime64(9))					
+number	Nullable(Int64)					
+toDate(number)	Nullable(Date)					
+4
+CustomSeparated
+c1	Nullable(Int64)					
+c2	Nullable(Date)					
+c1	Nullable(Float64)					
+c2	Nullable(Date)					
+c1	Nullable(Int64)					
+c2	Nullable(DateTime64(9))					
+c1	UInt8					
+c2	Nullable(Date)					
+4
+JSONEachRow
+number	Nullable(Int64)					
+toDate(number)	Nullable(Date)					
+number	Nullable(Float64)					
+toDate(number)	Nullable(Date)					
+number	Nullable(Int64)					
+toDate(number)	Nullable(DateTime64(9))					
+number	Nullable(Int64)					
+toDate(number)	Nullable(Date)					
+4
+JSONCompactEachRow
+c1	Nullable(Int64)					
+c2	Nullable(Date)					
+c1	Nullable(Float64)					
+c2	Nullable(Date)					
+c1	Nullable(Int64)					
+c2	Nullable(DateTime64(9))					
+c1	UInt8					
+c2	Nullable(Date)					
+4
+Values
+c1	Nullable(Int64)					
+c2	Nullable(Date)					
+c1	Nullable(Float64)					
+c2	Nullable(Date)					
+c1	Nullable(Int64)					
+c2	Nullable(DateTime64(9))					
+c1	UInt8					
+c2	Nullable(Date)					
+4
--- a/tests/queries/0_stateless/02404_schema_inference_cache_respect_format_settings.sql.j2
+++ b/tests/queries/0_stateless/02404_schema_inference_cache_respect_format_settings.sql.j2
@ -0,0 +1,16 @@
+-- Tags: no-parallel, no-fasttest
+
+system drop schema cache for file;
+
+{% for format in ['TSV', 'TSVWithNames', 'CSV', 'CSVWithNames', 'TSKV', 'CustomSeparated', 'JSONEachRow', 'JSONCompactEachRow', 'Values'] -%}
+
+select '{{ format }}';
+insert into function file(02404_data.{{ format }}) select number, toDate(number) from numbers(10);
+desc file(02404_data.{{ format }});
+desc file(02404_data.{{ format }}) settings input_format_try_infer_integers=0;
+desc file(02404_data.{{ format }}) settings input_format_try_infer_dates=0;
+desc file(02404_data.{{ format }}) settings schema_inference_hints='c1 UInt8';
+select count() from system.schema_inference_cache where countSubstrings(source, '02404_data.{{ format }}') > 0;
+
+{% endfor -%}
+