diff --git a/docs/en/interfaces/schema-inference.md b/docs/en/interfaces/schema-inference.md
index c448d0aee47..a757a032b7d 100644
--- a/docs/en/interfaces/schema-inference.md
+++ b/docs/en/interfaces/schema-inference.md
@@ -329,8 +329,8 @@ SELECT count() FROM system.schema_inference_cache WHERE storage='S3'
 ## Text formats {#text-formats}
 
 For text formats, ClickHouse reads the data row by row, extracts column values according to the format,
-and then uses some recursive parsers and heuristics to determine the type for each value. The maximum number of rows read from the data in schema inference
-is controlled by the setting `input_format_max_rows_to_read_for_schema_inference` with default value 25000.
+and then uses some recursive parsers and heuristics to determine the type for each value. The maximum number of rows and bytes read from the data in schema inference
+is controlled by the settings `input_format_max_rows_to_read_for_schema_inference` (25000 by default) and `input_format_max_bytes_to_read_for_schema_inference` (32 MB by default).
 By default, all inferred types are [Nullable](../sql-reference/data-types/nullable.md), but you can change this by setting `schema_inference_make_columns_nullable` (see examples in the [settings](#settings-for-text-formats) section).
 
 ### JSON formats {#json-formats}
@@ -1144,13 +1144,15 @@ Line: value_1=2, value_2="Some string 2", value_3="[4, 5, NULL]"$$)
 
 ### Settings for text formats {#settings-for-text-formats}
 
-#### input_format_max_rows_to_read_for_schema_inference
+#### input_format_max_rows_to_read_for_schema_inference/input_format_max_bytes_to_read_for_schema_inference
 
-This setting controls the maximum number of rows to be read while schema inference.
-The more rows are read, the more time is spent on schema inference, but the greater the chance to
+These settings control the amount of data to be read during schema inference.
+The more rows/bytes are read, the more time is spent on schema inference, but the greater the chance to
 correctly determine the types (especially when the data contains a lot of nulls).
 
-Default value: `25000`.
+Default values:
+- `25000` for `input_format_max_rows_to_read_for_schema_inference`.
+- `33554432` (32 MB) for `input_format_max_bytes_to_read_for_schema_inference`.
 
 #### column_names_for_schema_inference
 
@@ -1623,7 +1625,7 @@ In schema inference for CapnProto format ClickHouse uses the following type matc
 ## Strong-typed binary formats {#strong-typed-binary-formats}
 
 In such formats, each serialized value contains information about its type (and possibly about its name), but there is no information about the whole table.
-In schema inference for such formats, ClickHouse reads data row by row (up to `input_format_max_rows_to_read_for_schema_inference` rows) and extracts
+In schema inference for such formats, ClickHouse reads data row by row (up to `input_format_max_rows_to_read_for_schema_inference` rows or `input_format_max_bytes_to_read_for_schema_inference` bytes) and extracts
 the type (and possibly name) for each value from the data and then converts these types to ClickHouse types.
 
 ### MsgPack {#msgpack}
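As a usage sketch for the documentation above (not part of the patch; the file name and limit values are made up), both budgets can be raised together when the leading rows of a file are mostly null:

```sql
-- Hypothetical file; raise both budgets so inference sees enough non-null values.
DESC file('events.jsonl', 'JSONEachRow')
SETTINGS input_format_max_rows_to_read_for_schema_inference = 100000,
         input_format_max_bytes_to_read_for_schema_inference = 134217728; -- 128 MB
```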
diff --git a/docs/en/operations/settings/settings-formats.md b/docs/en/operations/settings/settings-formats.md
index 65038d3a256..e4a8c916bcf 100644
--- a/docs/en/operations/settings/settings-formats.md
+++ b/docs/en/operations/settings/settings-formats.md
@@ -137,6 +137,12 @@ The maximum rows of data to read for automatic schema inference.
 
 Default value: `25'000`.
 
+## input_format_max_bytes_to_read_for_schema_inference {#input_format_max_bytes_to_read_for_schema_inference}
+
+The maximum amount of data in bytes to read for automatic schema inference.
+
+Default value: `33554432` (32 MB).
+
 ## column_names_for_schema_inference {#column_names_for_schema_inference}
 
 The list of column names to use in schema inference for formats without column names. The format: 'column1,column2,column3,...'
diff --git a/src/Core/Settings.h b/src/Core/Settings.h
index 67c92a0be8b..f1e6c518f30 100644
--- a/src/Core/Settings.h
+++ b/src/Core/Settings.h
@@ -844,6 +844,7 @@ class IColumn;
     M(UInt64, input_format_msgpack_number_of_columns, 0, "The number of columns in inserted MsgPack data. Used for automatic schema inference from data.", 0) \
     M(MsgPackUUIDRepresentation, output_format_msgpack_uuid_representation, FormatSettings::MsgPackUUIDRepresentation::EXT, "The way how to output UUID in MsgPack format.", 0) \
     M(UInt64, input_format_max_rows_to_read_for_schema_inference, 25000, "The maximum rows of data to read for automatic schema inference", 0) \
+    M(UInt64, input_format_max_bytes_to_read_for_schema_inference, 32 * 1024 * 1024, "The maximum bytes of data to read for automatic schema inference", 0) \
     M(Bool, input_format_csv_use_best_effort_in_schema_inference, true, "Use some tweaks and heuristics to infer schema in CSV format", 0) \
     M(Bool, input_format_tsv_use_best_effort_in_schema_inference, true, "Use some tweaks and heuristics to infer schema in TSV format", 0) \
     M(Bool, input_format_csv_detect_header, true, "Automatically detect header with names and types in CSV format", 0) \
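A quick way to confirm the compiled-in default and to override it session-wide rather than per query; a minimal sketch using only standard setting mechanics:

```sql
-- The default is 32 * 1024 * 1024 = 33554432 bytes, per the Settings.h entry above.
SELECT value FROM system.settings
WHERE name = 'input_format_max_bytes_to_read_for_schema_inference';

-- Session-wide override (64 MB), instead of a per-query SETTINGS clause.
SET input_format_max_bytes_to_read_for_schema_inference = 67108864;
```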
diff --git a/src/Formats/EscapingRuleUtils.cpp b/src/Formats/EscapingRuleUtils.cpp
index 804f32e4b46..9f744218da2 100644
--- a/src/Formats/EscapingRuleUtils.cpp
+++ b/src/Formats/EscapingRuleUtils.cpp
@@ -408,9 +408,10 @@ DataTypes getDefaultDataTypeForEscapingRules(const std::vector<FormatSettings::EscapingRule> & escaping_rules)
diff --git a/src/Formats/ReadSchemaUtils.cpp b/src/Formats/ReadSchemaUtils.cpp
--- a/src/Formats/ReadSchemaUtils.cpp
+++ b/src/Formats/ReadSchemaUtils.cpp
@@ ... @@ ColumnsDescription readSchemaFromFormat(
     size_t max_rows_to_read = format_settings ? format_settings->max_rows_to_read_for_schema_inference
                                               : context->getSettingsRef().input_format_max_rows_to_read_for_schema_inference;
+    size_t max_bytes_to_read = format_settings ? format_settings->max_bytes_to_read_for_schema_inference
+                                               : context->getSettingsRef().input_format_max_bytes_to_read_for_schema_inference;
     size_t iterations = 0;
     ColumnsDescription cached_columns;
     while (true)
@@ -120,7 +122,7 @@ ColumnsDescription readSchemaFromFormat(
             try
             {
                 schema_reader = FormatFactory::instance().getSchemaReader(format_name, *buf, context, format_settings);
-                schema_reader->setMaxRowsToRead(max_rows_to_read);
+                schema_reader->setMaxRowsAndBytesToRead(max_rows_to_read, max_bytes_to_read);
                 names_and_types = schema_reader->readSchema();
                 break;
             }
@@ -132,10 +134,14 @@
             size_t rows_read = schema_reader->getNumRowsRead();
             assert(rows_read <= max_rows_to_read);
             max_rows_to_read -= schema_reader->getNumRowsRead();
-            if (rows_read != 0 && max_rows_to_read == 0)
+            size_t bytes_read = buf->count();
+            /// We could exceed max_bytes_to_read a bit to complete row parsing.
+            max_bytes_to_read -= std::min(bytes_read, max_bytes_to_read);
+            if (rows_read != 0 && (max_rows_to_read == 0 || max_bytes_to_read == 0))
             {
-                exception_message += "\nTo increase the maximum number of rows to read for structure determination, use setting "
-                                     "input_format_max_rows_to_read_for_schema_inference";
+                exception_message += "\nTo increase the maximum number of rows/bytes to read for structure determination, use the settings "
+                                     "input_format_max_rows_to_read_for_schema_inference/input_format_max_bytes_to_read_for_schema_inference";
+
                 if (iterations > 1)
                 {
                     exception_messages += "\n" + exception_message;
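To make the code path above concrete: when some rows were read but either budget is exhausted before a type could be inferred, the exception now points at both settings. A sketch mirroring the regression test added at the end of this patch (inline data is illustrative):

```sql
-- The 10-byte budget is used up on '{"a" : null}', so only nulls were seen
-- and inference fails; the error hint now names both limits.
DESC format('JSONEachRow', '{"a" : null}, {"a" : 42}')
SETTINGS input_format_max_bytes_to_read_for_schema_inference = 10;
```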
" + "Most likely setting input_format_max_rows_to_read_for_schema_inference or input_format_max_bytes_to_read_for_schema_inference is set to 0"); bool eof = false; auto names_and_types = readRowAndGetNamesAndDataTypes(eof); @@ -245,7 +249,7 @@ NamesAndTypesList IRowWithNamesSchemaReader::readSchema() names_order.push_back(name); } - for (rows_read = 1; rows_read < max_rows_to_read; ++rows_read) + for (rows_read = 1; rows_read < max_rows_to_read && in.count() < max_bytes_to_read; ++rows_read) { auto new_names_and_types = readRowAndGetNamesAndDataTypes(eof); if (eof) diff --git a/src/Processors/Formats/ISchemaReader.h b/src/Processors/Formats/ISchemaReader.h index 78b34a07840..40702198a57 100644 --- a/src/Processors/Formats/ISchemaReader.h +++ b/src/Processors/Formats/ISchemaReader.h @@ -32,7 +32,7 @@ public: virtual bool needContext() const { return false; } virtual void setContext(ContextPtr &) {} - virtual void setMaxRowsToRead(size_t) {} + virtual void setMaxRowsAndBytesToRead(size_t, size_t) {} virtual size_t getNumRowsRead() const { return 0; } virtual ~ISchemaReader() = default; @@ -54,12 +54,17 @@ public: virtual void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type); protected: - void setMaxRowsToRead(size_t max_rows) override { max_rows_to_read = max_rows; } + void setMaxRowsAndBytesToRead(size_t max_rows, size_t max_bytes) override + { + max_rows_to_read = max_rows; + max_bytes_to_read = max_bytes; + } size_t getNumRowsRead() const override { return rows_read; } virtual void transformFinalTypeIfNeeded(DataTypePtr &) {} size_t max_rows_to_read; + size_t max_bytes_to_read; size_t rows_read = 0; DataTypePtr default_type; String hints_str; diff --git a/src/Processors/Formats/Impl/JSONColumnsBlockInputFormat.cpp b/src/Processors/Formats/Impl/JSONColumnsBlockInputFormat.cpp index 8d4c4b0c6cf..3d003658e64 100644 --- a/src/Processors/Formats/Impl/JSONColumnsBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONColumnsBlockInputFormat.cpp @@ -55,7 +55,7 @@ void registerJSONColumnsSchemaReader(FormatFactory & factory) ); factory.registerAdditionalInfoForSchemaCacheGetter("JSONColumns", [](const FormatSettings & settings) { - return getAdditionalFormatInfoByEscapingRule(settings, FormatSettings::EscapingRule::JSON); + return getAdditionalFormatInfoForAllRowBasedFormats(settings) + getAdditionalFormatInfoByEscapingRule(settings, FormatSettings::EscapingRule::JSON); }); } diff --git a/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.cpp b/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.cpp index 2e264c59f56..84a07ebc8fb 100644 --- a/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.cpp +++ b/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.cpp @@ -176,6 +176,8 @@ JSONColumnsSchemaReaderBase::JSONColumnsSchemaReaderBase( , hints_str(format_settings_.schema_inference_hints) , reader(std::move(reader_)) , column_names_from_settings(splitColumnNames(format_settings_.column_names_for_schema_inference)) + , max_rows_to_read(format_settings_.max_rows_to_read_for_schema_inference) + , max_bytes_to_read(format_settings_.max_bytes_to_read_for_schema_inference) { } @@ -196,12 +198,12 @@ void JSONColumnsSchemaReaderBase::transformTypesIfNeeded(DataTypePtr & type, Dat NamesAndTypesList JSONColumnsSchemaReaderBase::readSchema() { - size_t total_rows_read = 0; std::unordered_map names_to_types; std::vector names_order; /// Read data block by block and determine the type for each column - /// until max_rows_to_read_for_schema_inference is 
diff --git a/src/Processors/Formats/Impl/JSONColumnsBlockInputFormat.cpp b/src/Processors/Formats/Impl/JSONColumnsBlockInputFormat.cpp
index 8d4c4b0c6cf..3d003658e64 100644
--- a/src/Processors/Formats/Impl/JSONColumnsBlockInputFormat.cpp
+++ b/src/Processors/Formats/Impl/JSONColumnsBlockInputFormat.cpp
@@ -55,7 +55,7 @@ void registerJSONColumnsSchemaReader(FormatFactory & factory)
     );
     factory.registerAdditionalInfoForSchemaCacheGetter("JSONColumns", [](const FormatSettings & settings)
     {
-        return getAdditionalFormatInfoByEscapingRule(settings, FormatSettings::EscapingRule::JSON);
+        return getAdditionalFormatInfoForAllRowBasedFormats(settings) + getAdditionalFormatInfoByEscapingRule(settings, FormatSettings::EscapingRule::JSON);
     });
 }
 
diff --git a/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.cpp b/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.cpp
index 2e264c59f56..84a07ebc8fb 100644
--- a/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.cpp
+++ b/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.cpp
@@ -176,6 +176,8 @@ JSONColumnsSchemaReaderBase::JSONColumnsSchemaReaderBase(
     , hints_str(format_settings_.schema_inference_hints)
     , reader(std::move(reader_))
     , column_names_from_settings(splitColumnNames(format_settings_.column_names_for_schema_inference))
+    , max_rows_to_read(format_settings_.max_rows_to_read_for_schema_inference)
+    , max_bytes_to_read(format_settings_.max_bytes_to_read_for_schema_inference)
 {
 }
 
@@ -196,12 +198,12 @@ void JSONColumnsSchemaReaderBase::transformTypesIfNeeded(DataTypePtr & type, Dat
 
 NamesAndTypesList JSONColumnsSchemaReaderBase::readSchema()
 {
-    size_t total_rows_read = 0;
     std::unordered_map<String, DataTypePtr> names_to_types;
     std::vector<String> names_order;
     /// Read data block by block and determine the type for each column
-    /// until max_rows_to_read_for_schema_inference is reached.
-    while (total_rows_read < format_settings.max_rows_to_read_for_schema_inference)
+    /// until max_rows_to_read/max_bytes_to_read is reached.
+    /// Note that we can exceed max_bytes_to_read to complete block parsing.
+    while (total_rows_read < max_rows_to_read && in.count() < max_bytes_to_read)
     {
         if (in.eof())
             break;
@@ -268,7 +270,7 @@ NamesAndTypesList JSONColumnsSchemaReaderBase::readSchema()
     return result;
 }
 
-DataTypePtr JSONColumnsSchemaReaderBase::readColumnAndGetDataType(const String & column_name, size_t & rows_read, size_t max_rows_to_read)
+DataTypePtr JSONColumnsSchemaReaderBase::readColumnAndGetDataType(const String & column_name, size_t & rows_read, size_t max_rows)
 {
     /// Check for empty column.
     if (reader->checkColumnEnd())
@@ -279,7 +281,7 @@ DataTypePtr JSONColumnsSchemaReaderBase::readColumnAndGetDataType(const String &
     do
     {
         /// If we reached max_rows_to_read, skip the rest part of this column.
-        if (rows_read == max_rows_to_read)
+        if (rows_read == max_rows)
         {
             reader->skipColumn();
             break;
diff --git a/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.h b/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.h
index 2babc0734f9..886c8841540 100644
--- a/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.h
+++ b/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.h
@@ -82,11 +82,19 @@ public:
     bool needContext() const override { return !hints_str.empty(); }
    void setContext(ContextPtr & ctx) override;
 
+    void setMaxRowsAndBytesToRead(size_t max_rows, size_t max_bytes) override
+    {
+        max_rows_to_read = max_rows;
+        max_bytes_to_read = max_bytes;
+    }
+
+    size_t getNumRowsRead() const override { return total_rows_read; }
+
 private:
     NamesAndTypesList readSchema() override;
 
-    /// Read whole column in the block (up to max_rows_to_read rows) and extract the data type.
-    DataTypePtr readColumnAndGetDataType(const String & column_name, size_t & rows_read, size_t max_rows_to_read);
+    /// Read whole column in the block (up to max_rows rows) and extract the data type.
+    DataTypePtr readColumnAndGetDataType(const String & column_name, size_t & rows_read, size_t max_rows);
 
     const FormatSettings format_settings;
     String hints_str;
@@ -95,6 +103,10 @@ private:
     std::unique_ptr<JSONColumnsReaderBase> reader;
     Names column_names_from_settings;
     JSONInferenceInfo inference_info;
+
+    size_t total_rows_read = 0;
+    size_t max_rows_to_read;
+    size_t max_bytes_to_read;
 };
 
 }
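For the column-oriented reader above, the same budgets apply while blocks are consumed; a small sketch with inline JSONColumns data (values illustrative, expected to infer `a Nullable(Int64)`):

```sql
-- JSONColumns groups values by column; total_rows_read advances per block,
-- and the byte budget is checked between blocks.
DESC format('JSONColumns', '{"a" : [null, 42]}')
SETTINGS input_format_max_rows_to_read_for_schema_inference = 2;
```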
diff --git a/src/Processors/Formats/Impl/JSONCompactColumnsBlockInputFormat.cpp b/src/Processors/Formats/Impl/JSONCompactColumnsBlockInputFormat.cpp
index ade18d21892..09df7beaa73 100644
--- a/src/Processors/Formats/Impl/JSONCompactColumnsBlockInputFormat.cpp
+++ b/src/Processors/Formats/Impl/JSONCompactColumnsBlockInputFormat.cpp
@@ -53,7 +53,7 @@ void registerJSONCompactColumnsSchemaReader(FormatFactory & factory)
     );
     factory.registerAdditionalInfoForSchemaCacheGetter("JSONCompactColumns", [](const FormatSettings & settings)
     {
-        auto result = getAdditionalFormatInfoByEscapingRule(settings, FormatSettings::EscapingRule::JSON);
+        auto result = getAdditionalFormatInfoForAllRowBasedFormats(settings) + getAdditionalFormatInfoByEscapingRule(settings, FormatSettings::EscapingRule::JSON);
         return result + fmt::format(", column_names_for_schema_inference={}", settings.column_names_for_schema_inference);
     });
 }
diff --git a/tests/queries/0_stateless/02783_max_bytes_to_read_in_schema_inference.reference b/tests/queries/0_stateless/02783_max_bytes_to_read_in_schema_inference.reference
new file mode 100644
index 00000000000..d45098ddc0f
--- /dev/null
+++ b/tests/queries/0_stateless/02783_max_bytes_to_read_in_schema_inference.reference
@@ -0,0 +1 @@
+a Nullable(Int64)
diff --git a/tests/queries/0_stateless/02783_max_bytes_to_read_in_schema_inference.sql b/tests/queries/0_stateless/02783_max_bytes_to_read_in_schema_inference.sql
new file mode 100644
index 00000000000..9dbf176472d
--- /dev/null
+++ b/tests/queries/0_stateless/02783_max_bytes_to_read_in_schema_inference.sql
@@ -0,0 +1,4 @@
+set input_format_max_rows_to_read_for_schema_inference=2;
+desc format('JSONEachRow', '{"a" : null}, {"a" : 42}') settings input_format_max_bytes_to_read_for_schema_inference=10; -- {serverError ONLY_NULLS_WHILE_READING_SCHEMA}
+desc format('JSONEachRow', '{"a" : null}, {"a" : 42}') settings input_format_max_bytes_to_read_for_schema_inference=20;
+
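A note on why these byte values work: the first inline row, `{"a" : null}`, is 12 bytes, so a 10-byte budget is exhausted right after it and inference has seen only nulls (hence the expected ONLY_NULLS_WHILE_READING_SCHEMA), while a 20-byte budget leaves room to parse the second row, whose `42` settles the type as `Nullable(Int64)`, matching the reference file.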