From 557edbd172aac7a97aca68d37b204ce2c3f8f749 Mon Sep 17 00:00:00 2001
From: avogar
Date: Thu, 24 Mar 2022 12:54:12 +0000
Subject: [PATCH 01/17] Add some improvements and fixes in schema inference

---
 src/Core/Settings.h                           |   6 +
 src/DataTypes/DataTypeMap.cpp                 |  36 ++-
 src/DataTypes/DataTypeMap.h                   |   2 +
 src/Formats/EscapingRuleUtils.cpp             | 294 +++++++++++++++---
 src/Formats/EscapingRuleUtils.h               |   5 +-
 src/Formats/FormatFactory.cpp                 |   8 +-
 src/Formats/FormatFactory.h                   |   2 +-
 src/Formats/FormatSettings.h                  |   9 +-
 src/Formats/ReadSchemaUtils.cpp               |  30 +-
 src/Formats/ReadSchemaUtils.h                 |   8 +-
 src/IO/ReadHelpers.cpp                        |  11 +
 src/Processors/Formats/ISchemaReader.cpp      |  22 +-
 src/Processors/Formats/ISchemaReader.h        |   9 +-
 .../Formats/Impl/ArrowBlockInputFormat.cpp    |  10 +-
 .../Formats/Impl/ArrowColumnToCHColumn.cpp    |  70 ++++-
 .../Formats/Impl/ArrowColumnToCHColumn.h      |   6 +-
 .../Formats/Impl/AvroRowInputFormat.cpp       |   4 +-
 .../Formats/Impl/BinaryRowInputFormat.cpp     |   4 +-
 .../Formats/Impl/CSVRowInputFormat.cpp        |  12 +-
 .../Formats/Impl/CSVRowInputFormat.h          |   3 +-
 .../Impl/CustomSeparatedRowInputFormat.cpp    |  11 +-
 .../Impl/CustomSeparatedRowInputFormat.h      |   3 +-
 .../Impl/JSONCompactEachRowRowInputFormat.cpp |   4 +-
 .../Impl/JSONEachRowRowInputFormat.cpp        |   4 +-
 .../Formats/Impl/MsgPackRowInputFormat.cpp    |   4 +-
 src/Processors/Formats/Impl/NativeFormat.cpp  |   2 +-
 .../Formats/Impl/ORCBlockInputFormat.cpp      |   8 +-
 .../Formats/Impl/ParquetBlockInputFormat.cpp  |   8 +-
 .../Formats/Impl/RegexpRowInputFormat.cpp     |  11 +-
 .../Formats/Impl/RegexpRowInputFormat.h       |   3 +-
 .../Formats/Impl/TSKVRowInputFormat.cpp       |   2 +-
 .../Impl/TabSeparatedRowInputFormat.cpp       |   4 +-
 .../Formats/Impl/TemplateRowInputFormat.cpp   |  12 +-
 .../Formats/Impl/TemplateRowInputFormat.h     |   4 +-
 .../Formats/Impl/ValuesBlockInputFormat.cpp   |  50 ++-
 .../Formats/Impl/ValuesBlockInputFormat.h     |   4 +-
 .../RowInputFormatWithNamesAndTypes.cpp       |   4 +-
 .../Formats/RowInputFormatWithNamesAndTypes.h |   2 +-
 .../02149_schema_inference.reference          |  32 +-
 ...ma_inference_formats_with_schema.reference | 168 +++++-----
 ...arquet_nullable_schema_inference.reference |  40 +++
 ...w_orc_parquet_nullable_schema_inference.sh |  21 ++
 ...ead_null_type_to_nullable_column.reference |   1 +
 ...arrow_read_null_type_to_nullable_column.sh |  28 ++
 ...column_names_in_shcmea_inference.reference |   8 +
 ...02244_column_names_in_shcmea_inference.sql |  12 +
 .../02245_parquet_skip_unknown_type.reference |  16 +
 .../02245_parquet_skip_unknown_type.sh        |  18 ++
 ...csv_best_effort_schema_inference.reference | 107 +++++++
 ...46_tsv_csv_best_effort_schema_inference.sh | 220 +++++++++++++
 50 files changed, 1062 insertions(+), 300 deletions(-)
 create mode 100644 tests/queries/0_stateless/02242_arrow_orc_parquet_nullable_schema_inference.reference
 create mode 100755 tests/queries/0_stateless/02242_arrow_orc_parquet_nullable_schema_inference.sh
 create mode 100644 tests/queries/0_stateless/02243_arrow_read_null_type_to_nullable_column.reference
 create mode 100755 tests/queries/0_stateless/02243_arrow_read_null_type_to_nullable_column.sh
 create mode 100644 tests/queries/0_stateless/02244_column_names_in_shcmea_inference.reference
 create mode 100644 tests/queries/0_stateless/02244_column_names_in_shcmea_inference.sql
 create mode 100644 tests/queries/0_stateless/02245_parquet_skip_unknown_type.reference
 create mode 100755 tests/queries/0_stateless/02245_parquet_skip_unknown_type.sh
 create mode 100644 tests/queries/0_stateless/02246_tsv_csv_best_effort_schema_inference.reference
 create mode 100755 tests/queries/0_stateless/02246_tsv_csv_best_effort_schema_inference.sh

diff --git a/src/Core/Settings.h b/src/Core/Settings.h
index ca2e9f12e66..86ea202fda7 100644
--- a/src/Core/Settings.h
+++ b/src/Core/Settings.h
@@ -631,6 +631,12 @@ class IColumn;
     M(UInt64, input_format_msgpack_number_of_columns, 0, "The number of columns in inserted MsgPack data. Used for automatic schema inference from data.", 0) \
     M(MsgPackUUIDRepresentation, output_format_msgpack_uuid_representation, FormatSettings::MsgPackUUIDRepresentation::EXT, "The way how to output UUID in MsgPack format.", 0) \
     M(UInt64, input_format_max_rows_to_read_for_schema_inference, 100, "The maximum rows of data to read for automatic schema inference", 0) \
+    M(Bool, input_format_csv_use_best_effort_in_schema_inference, true, "Use some tweaks and heuristics to infer schema in CSV format", 0) \
+    M(Bool, input_format_tsv_use_best_effort_in_schema_inference, true, "Use some tweaks and heuristics to infer schema in TSV format", 0) \
+    M(Bool, input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference, false, "Allow to skip columns with unsupported types while schema inference for format Parquet", 0) \
+    M(Bool, input_format_orc_skip_columns_with_unsupported_types_in_schema_inference, false, "Allow to skip columns with unsupported types while schema inference for format ORC", 0) \
+    M(Bool, input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference, false, "Allow to skip columns with unsupported types while schema inference for format Arrow", 0) \
+    M(String, column_names_for_schema_inference, "", "The list of column names to use in schema inference for formats without column names. The format: 'column1,column2,column3,...'", 0) \
     \
     M(DateTimeInputFormat, date_time_input_format, FormatSettings::DateTimeInputFormat::Basic, "Method to read DateTime from text input formats. Possible values: 'basic', 'best_effort' and 'best_effort_us'.", 0) \
     M(DateTimeOutputFormat, date_time_output_format, FormatSettings::DateTimeOutputFormat::Simple, "Method to write DateTime to text output.
Possible values: 'simple', 'iso', 'unix_timestamp'.", 0) \ diff --git a/src/DataTypes/DataTypeMap.cpp b/src/DataTypes/DataTypeMap.cpp index 41de17982aa..42ec739c33b 100644 --- a/src/DataTypes/DataTypeMap.cpp +++ b/src/DataTypes/DataTypeMap.cpp @@ -45,22 +45,7 @@ DataTypeMap::DataTypeMap(const DataTypePtr & key_type_, const DataTypePtr & valu void DataTypeMap::assertKeyType() const { - bool type_error = false; - if (key_type->getTypeId() == TypeIndex::LowCardinality) - { - const auto & low_cardinality_data_type = assert_cast(*key_type); - if (!isStringOrFixedString(*(low_cardinality_data_type.getDictionaryType()))) - type_error = true; - } - else if (!key_type->isValueRepresentedByInteger() - && !isStringOrFixedString(*key_type) - && !WhichDataType(key_type).isNothing() - && !WhichDataType(key_type).isUUID()) - { - type_error = true; - } - - if (type_error) + if (!checkKeyType(key_type)) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Type of Map key must be a type, that can be represented by integer or String or FixedString (possibly LowCardinality) or UUID," " but {} given", key_type->getName()); @@ -102,6 +87,25 @@ bool DataTypeMap::equals(const IDataType & rhs) const return nested->equals(*rhs_map.nested); } +bool DataTypeMap::checkKeyType(DataTypePtr key_type) +{ + if (key_type->getTypeId() == TypeIndex::LowCardinality) + { + const auto & low_cardinality_data_type = assert_cast(*key_type); + if (!isStringOrFixedString(*(low_cardinality_data_type.getDictionaryType()))) + return false; + } + else if (!key_type->isValueRepresentedByInteger() + && !isStringOrFixedString(*key_type) + && !WhichDataType(key_type).isNothing() + && !WhichDataType(key_type).isUUID()) + { + return false; + } + + return true; +} + static DataTypePtr create(const ASTPtr & arguments) { if (!arguments || arguments->children.size() != 2) diff --git a/src/DataTypes/DataTypeMap.h b/src/DataTypes/DataTypeMap.h index 65bdd93ca4d..479008031fe 100644 --- a/src/DataTypes/DataTypeMap.h +++ b/src/DataTypes/DataTypeMap.h @@ -48,6 +48,8 @@ public: SerializationPtr doGetDefaultSerialization() const override; + static bool checkKeyType(DataTypePtr key_type); + private: void assertKeyType() const; }; diff --git a/src/Formats/EscapingRuleUtils.cpp b/src/Formats/EscapingRuleUtils.cpp index b0ea10abdb6..08b34ebb0fc 100644 --- a/src/Formats/EscapingRuleUtils.cpp +++ b/src/Formats/EscapingRuleUtils.cpp @@ -5,12 +5,17 @@ #include #include #include +#include +#include +#include +#include +#include +#include #include #include -#include +#include #include -#include -#include + namespace DB { @@ -138,7 +143,8 @@ bool deserializeFieldByEscapingRule( serialization->deserializeTextRaw(column, buf, format_settings); break; default: - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Escaping rule {} is not suitable for deserialization", escapingRuleToString(escaping_rule)); + throw Exception( + ErrorCodes::BAD_ARGUMENTS, "Escaping rule {} is not suitable for deserialization", escapingRuleToString(escaping_rule)); } return read; } @@ -176,7 +182,8 @@ void serializeFieldByEscapingRule( } } -void writeStringByEscapingRule(const String & value, WriteBuffer & out, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings) +void writeStringByEscapingRule( + const String & value, WriteBuffer & out, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings) { switch (escaping_rule) { @@ -249,85 +256,270 @@ String readStringByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule e return 
readByEscapingRule(buf, escaping_rule, format_settings); } -static bool evaluateConstantExpressionFromString(const StringRef & field, DataTypePtr & type, ContextPtr context) +static DataTypePtr determineDataTypeForSingleFieldImpl(ReadBuffer & buf) { - if (!context) - throw Exception(ErrorCodes::LOGICAL_ERROR, "You must provide context to evaluate constant expression"); + if (buf.eof()) + return nullptr; - ParserExpression parser; - Expected expected; - Tokens tokens(field.data, field.data + field.size); - IParser::Pos token_iterator(tokens, context->getSettingsRef().max_parser_depth); - ASTPtr ast; - - /// FIXME: Our parser cannot parse maps in the form of '{key : value}' that is used in text formats. - bool parsed = parser.parse(token_iterator, ast, expected); - if (!parsed || !token_iterator->isEnd()) - return false; - - try + /// Array + if (checkChar('[', buf)) { - std::pair result = evaluateConstantExpression(ast, context); - type = generalizeDataType(result.second); - return true; + skipWhitespaceIfAny(buf); + + DataTypes nested_types; + bool first = true; + while (!buf.eof() && *buf.position() != ']') + { + if (!first) + { + skipWhitespaceIfAny(buf); + if (!checkChar(',', buf)) + return nullptr; + skipWhitespaceIfAny(buf); + } + else + first = false; + + auto nested_type = determineDataTypeForSingleFieldImpl(buf); + if (!nested_type) + return nullptr; + + nested_types.push_back(nested_type); + } + + if (buf.eof()) + return nullptr; + + ++buf.position(); + + if (nested_types.empty()) + return std::make_shared(std::make_shared()); + + auto least_supertype = tryGetLeastSupertype(nested_types); + if (!least_supertype) + return nullptr; + + return std::make_shared(least_supertype); } - catch (...) + + /// Tuple + if (checkChar('(', buf)) { - return false; + skipWhitespaceIfAny(buf); + + DataTypes nested_types; + bool first = true; + while (!buf.eof() && *buf.position() != ')') + { + if (!first) + { + skipWhitespaceIfAny(buf); + if (!checkChar(',', buf)) + return nullptr; + skipWhitespaceIfAny(buf); + } + else + first = false; + + auto nested_type = determineDataTypeForSingleFieldImpl(buf); + if (!nested_type) + return nullptr; + + nested_types.push_back(nested_type); + } + + if (buf.eof() || nested_types.empty()) + return nullptr; + + ++buf.position(); + + return std::make_shared(nested_types); } + + /// Map + if (checkChar('{', buf)) + { + skipWhitespaceIfAny(buf); + + DataTypes key_types; + DataTypes value_types; + bool first = true; + while (!buf.eof() && *buf.position() != '}') + { + if (!first) + { + skipWhitespaceIfAny(buf); + if (!checkChar(',', buf)) + return nullptr; + skipWhitespaceIfAny(buf); + } + else + first = false; + + auto key_type = determineDataTypeForSingleFieldImpl(buf); + if (!key_type) + return nullptr; + + key_types.push_back(key_type); + + skipWhitespaceIfAny(buf); + if (!checkChar(':', buf)) + return nullptr; + skipWhitespaceIfAny(buf); + + auto value_type = determineDataTypeForSingleFieldImpl(buf); + if (!value_type) + return nullptr; + + value_types.push_back(value_type); + } + + if (buf.eof()) + return nullptr; + + ++buf.position(); + skipWhitespaceIfAny(buf); + + if (key_types.empty()) + return std::make_shared(std::make_shared(), std::make_shared()); + + auto key_least_supertype = tryGetLeastSupertype(key_types); + + auto value_least_supertype = tryGetLeastSupertype(value_types); + if (!key_least_supertype || !value_least_supertype) + return nullptr; + + if (!DataTypeMap::checkKeyType(key_least_supertype)) + return nullptr; + + return 
std::make_shared(key_least_supertype, value_least_supertype); + } + + /// String + if (*buf.position() == '\'') + { + ++buf.position(); + while (!buf.eof()) + { + char * next_pos = find_first_symbols<'\\', '\''>(buf.position(), buf.buffer().end()); + buf.position() = next_pos; + + if (!buf.hasPendingData()) + continue; + + if (*buf.position() == '\'') + break; + + if (*buf.position() == '\\') + ++buf.position(); + } + + if (buf.eof()) + return nullptr; + + ++buf.position(); + return std::make_shared(); + } + + /// Bool + if (checkStringCaseInsensitive("true", buf) || checkStringCaseInsensitive("false", buf)) + return DataTypeFactory::instance().get("Bool"); + + /// Null + if (checkStringCaseInsensitive("NULL", buf)) + return std::make_shared(); + + Float64 tmp; + if (tryReadFloatText(tmp, buf)) + return std::make_shared(); + + return nullptr; } -DataTypePtr determineDataTypeByEscapingRule(const String & field, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule, ContextPtr context) +static DataTypePtr determineDataTypeForSingleField(ReadBuffer & buf) +{ + return makeNullableRecursivelyAndCheckForNothing(determineDataTypeForSingleFieldImpl(buf)); +} + +DataTypePtr determineDataTypeByEscapingRule(const String & field, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule) { switch (escaping_rule) { case FormatSettings::EscapingRule::Quoted: { - DataTypePtr type; - bool parsed = evaluateConstantExpressionFromString(field, type, context); - return parsed ? type : nullptr; + ReadBufferFromString buf(field); + auto type = determineDataTypeForSingleField(buf); + return buf.eof() ? type : nullptr; } case FormatSettings::EscapingRule::JSON: return getDataTypeFromJSONField(field); case FormatSettings::EscapingRule::CSV: { + if (!format_settings.csv.input_format_use_best_effort_in_schema_inference) + return makeNullable(std::make_shared()); + if (field.empty() || field == format_settings.csv.null_representation) return nullptr; if (field == format_settings.bool_false_representation || field == format_settings.bool_true_representation) - return std::make_shared(); + return DataTypeFactory::instance().get("Nullable(Bool)"); - DataTypePtr type; - bool parsed; - if (field[0] == '\'' || field[0] == '"') + if (field.size() > 1 && ((field.front() == '\'' && field.back() == '\'') || (field.front() == '"' && field.back() == '"'))) { - /// Try to evaluate expression inside quotes. - parsed = evaluateConstantExpressionFromString(StringRef(field.data() + 1, field.size() - 2), type, context); - /// If it's a number in quotes we determine it as a string. - if (parsed && type && isNumber(removeNullable(type))) - return makeNullable(std::make_shared()); - } - else - parsed = evaluateConstantExpressionFromString(field, type, context); + auto s = std::string_view(field.data() + 1, field.size() - 2); - /// If we couldn't parse an expression, determine it as a string. - return parsed ? type : makeNullable(std::make_shared()); + ReadBufferFromString buf(std::string_view(field.data() + 1, field.size() - 2)); + /// Try to determine the type of value inside quotes + auto type = determineDataTypeForSingleField(buf); + + if (!type) + return nullptr; + + /// If it's a number or tuple in quotes or there is some unread data in buffer, we determine it as a string. + if (isNumber(removeNullable(type)) || isTuple(type) || !buf.eof()) + return makeNullable(std::make_shared()); + + return type; + } + + /// Case when CSV value is not in quotes. 
Check if it's a number, and if not, determine it's as a string. + ReadBufferFromString buf(field); + Float64 tmp; + if (tryReadFloatText(tmp, buf) && buf.eof()) + return makeNullable(std::make_shared()); + + return makeNullable(std::make_shared()); } case FormatSettings::EscapingRule::Raw: [[fallthrough]]; case FormatSettings::EscapingRule::Escaped: - /// TODO: Try to use some heuristics here to determine the type of data. - return field.empty() ? nullptr : makeNullable(std::make_shared()); + { + if (!format_settings.tsv.input_format_use_best_effort_in_schema_inference) + return makeNullable(std::make_shared()); + + if (field.empty() || field == format_settings.tsv.null_representation) + return nullptr; + + if (field == format_settings.bool_false_representation || field == format_settings.bool_true_representation) + return DataTypeFactory::instance().get("Nullable(Bool)"); + + ReadBufferFromString buf(field); + auto type = determineDataTypeForSingleField(buf); + if (!buf.eof()) + return makeNullable(std::make_shared()); + + return type; + } default: throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot determine the type for value with {} escaping rule", escapingRuleToString(escaping_rule)); } } -DataTypes determineDataTypesByEscapingRule(const std::vector & fields, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule, ContextPtr context) +DataTypes determineDataTypesByEscapingRule(const std::vector & fields, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule) { DataTypes data_types; data_types.reserve(fields.size()); for (const auto & field : fields) - data_types.push_back(determineDataTypeByEscapingRule(field, format_settings, escaping_rule, context)); + data_types.push_back(determineDataTypeByEscapingRule(field, format_settings, escaping_rule)); return data_types; } @@ -344,4 +536,12 @@ DataTypePtr getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule escap } } +DataTypes getDefaultDataTypeForEscapingRules(const std::vector & escaping_rules) +{ + DataTypes data_types; + for (const auto & rule : escaping_rules) + data_types.push_back(getDefaultDataTypeForEscapingRule(rule)); + return data_types; +} + } diff --git a/src/Formats/EscapingRuleUtils.h b/src/Formats/EscapingRuleUtils.h index 10147b29ad6..3c7c768c003 100644 --- a/src/Formats/EscapingRuleUtils.h +++ b/src/Formats/EscapingRuleUtils.h @@ -49,9 +49,10 @@ String readFieldByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule es /// expression inside quotes as a constant expression, and if it fails or /// the result is a number (we don't parse numbers in quotes) we treat it as a String. 
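The CSV and TSV branches above are the user-visible core of this change: instead of always falling back to Nullable(String), the schema reader now tries to parse each field as a number, bool, array, tuple or map, and the new input_format_csv_use_best_effort_in_schema_inference / input_format_tsv_use_best_effort_in_schema_inference settings restore the old behaviour. A rough sketch of how this could be exercised with clickhouse-local, assuming a hypothetical data.csv whose rows look like 42,3.14,"hello","[1,2,3]",true (the file name, data and the exact inferred types are illustrative, not copied from the patch or its tests):

    DESC file('data.csv', 'CSV');
    -- with best-effort inference (the default) the columns come out roughly as
    -- Nullable(Float64), Nullable(Float64), Nullable(String), Array(Nullable(Float64)), Nullable(Bool);
    -- a number or tuple inside quotes is still treated as Nullable(String)

    SET input_format_csv_use_best_effort_in_schema_inference = 0;
    DESC file('data.csv', 'CSV');
    -- every column falls back to Nullable(String), as before this patch
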
/// - For TSV and TSVRaw we treat each field as a String (TODO: try to use some tweaks and heuristics here) -DataTypePtr determineDataTypeByEscapingRule(const String & field, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule, ContextPtr context = nullptr); -DataTypes determineDataTypesByEscapingRule(const std::vector & fields, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule, ContextPtr context = nullptr); +DataTypePtr determineDataTypeByEscapingRule(const String & field, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule); +DataTypes determineDataTypesByEscapingRule(const std::vector & fields, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule); DataTypePtr getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule escaping_rule); +DataTypes getDefaultDataTypeForEscapingRules(const std::vector & escaping_rules); } diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 08554cf7e07..f8636768d00 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -65,6 +65,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.csv.input_format_enum_as_number = settings.input_format_csv_enum_as_number; format_settings.csv.null_representation = settings.format_csv_null_representation; format_settings.csv.input_format_arrays_as_nested_csv = settings.input_format_csv_arrays_as_nested_csv; + format_settings.csv.input_format_use_best_effort_in_schema_inference = settings.input_format_csv_use_best_effort_in_schema_inference; format_settings.hive_text.fields_delimiter = settings.input_format_hive_text_fields_delimiter; format_settings.hive_text.collection_items_delimiter = settings.input_format_hive_text_collection_items_delimiter; format_settings.hive_text.map_keys_delimiter = settings.input_format_hive_text_map_keys_delimiter; @@ -94,6 +95,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.parquet.row_group_size = settings.output_format_parquet_row_group_size; format_settings.parquet.import_nested = settings.input_format_parquet_import_nested; format_settings.parquet.allow_missing_columns = settings.input_format_parquet_allow_missing_columns; + format_settings.parquet.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference; format_settings.pretty.charset = settings.output_format_pretty_grid_charset.toString() == "ASCII" ? 
FormatSettings::Pretty::Charset::ASCII : FormatSettings::Pretty::Charset::UTF8; format_settings.pretty.color = settings.output_format_pretty_color; format_settings.pretty.max_column_pad_width = settings.output_format_pretty_max_column_pad_width; @@ -114,6 +116,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.tsv.empty_as_default = settings.input_format_tsv_empty_as_default; format_settings.tsv.input_format_enum_as_number = settings.input_format_tsv_enum_as_number; format_settings.tsv.null_representation = settings.format_tsv_null_representation; + format_settings.tsv.input_format_use_best_effort_in_schema_inference = settings.input_format_tsv_use_best_effort_in_schema_inference; format_settings.values.accurate_types_of_literals = settings.input_format_values_accurate_types_of_literals; format_settings.values.deduce_templates_of_expressions = settings.input_format_values_deduce_templates_of_expressions; format_settings.values.interpret_expressions = settings.input_format_values_interpret_expressions; @@ -123,15 +126,18 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.arrow.low_cardinality_as_dictionary = settings.output_format_arrow_low_cardinality_as_dictionary; format_settings.arrow.import_nested = settings.input_format_arrow_import_nested; format_settings.arrow.allow_missing_columns = settings.input_format_arrow_allow_missing_columns; + format_settings.arrow.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference; format_settings.orc.import_nested = settings.input_format_orc_import_nested; format_settings.orc.allow_missing_columns = settings.input_format_orc_allow_missing_columns; format_settings.orc.row_batch_size = settings.input_format_orc_row_batch_size; + format_settings.orc.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_orc_skip_columns_with_unsupported_types_in_schema_inference; format_settings.defaults_for_omitted_fields = settings.input_format_defaults_for_omitted_fields; format_settings.capn_proto.enum_comparing_mode = settings.format_capn_proto_enum_comparising_mode; format_settings.seekable_read = settings.input_format_allow_seeks; format_settings.msgpack.number_of_columns = settings.input_format_msgpack_number_of_columns; format_settings.msgpack.output_uuid_representation = settings.output_format_msgpack_uuid_representation; format_settings.max_rows_to_read_for_schema_inference = settings.input_format_max_rows_to_read_for_schema_inference; + format_settings.column_names_for_schema_inference = settings.column_names_for_schema_inference; /// Validate avro_schema_registry_url with RemoteHostFilter when non-empty and in Server context if (format_settings.schema.is_server) @@ -366,7 +372,7 @@ SchemaReaderPtr FormatFactory::getSchemaReader( throw Exception("FormatFactory: Format " + name + " doesn't support schema inference.", ErrorCodes::LOGICAL_ERROR); auto format_settings = _format_settings ? 
*_format_settings : getFormatSettings(context); - return schema_reader_creator(buf, format_settings, context); + return schema_reader_creator(buf, format_settings); } ExternalSchemaReaderPtr FormatFactory::getExternalSchemaReader( diff --git a/src/Formats/FormatFactory.h b/src/Formats/FormatFactory.h index 344dabd3f4d..2f53da3bdff 100644 --- a/src/Formats/FormatFactory.h +++ b/src/Formats/FormatFactory.h @@ -97,7 +97,7 @@ private: /// The checker should return true if format support append. using AppendSupportChecker = std::function; - using SchemaReaderCreator = std::function; + using SchemaReaderCreator = std::function; using ExternalSchemaReaderCreator = std::function; struct Creators diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index 4881c1a43c8..6ecd04536a6 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -37,6 +37,8 @@ struct FormatSettings bool seekable_read = true; UInt64 max_rows_to_read_for_schema_inference = 100; + String column_names_for_schema_inference = ""; + enum class DateTimeInputFormat { Basic, /// Default format for fast parsing: YYYY-MM-DD hh:mm:ss (ISO-8601 without fractional part and timezone) or NNNNNNNNNN unix timestamp. @@ -75,6 +77,7 @@ struct FormatSettings bool low_cardinality_as_dictionary = false; bool import_nested = false; bool allow_missing_columns = false; + bool skip_columns_with_unsupported_types_in_schema_inference = false; } arrow; struct @@ -101,6 +104,7 @@ struct FormatSettings bool input_format_arrays_as_nested_csv = false; String null_representation = "\\N"; char tuple_delimiter = ','; + bool input_format_use_best_effort_in_schema_inference = true; } csv; struct HiveText @@ -108,7 +112,7 @@ struct FormatSettings char fields_delimiter = '\x01'; char collection_items_delimiter = '\x02'; char map_keys_delimiter = '\x03'; - Names input_field_names; + Names input_field_names = {""}; } hive_text; struct Custom @@ -137,6 +141,7 @@ struct FormatSettings UInt64 row_group_size = 1000000; bool import_nested = false; bool allow_missing_columns = false; + bool skip_columns_with_unsupported_types_in_schema_inference = false; } parquet; struct Pretty @@ -203,6 +208,7 @@ struct FormatSettings bool crlf_end_of_line = false; String null_representation = "\\N"; bool input_format_enum_as_number = false; + bool input_format_use_best_effort_in_schema_inference = true; } tsv; struct @@ -217,6 +223,7 @@ struct FormatSettings bool import_nested = false; bool allow_missing_columns = false; int64_t row_batch_size = 100'000; + bool skip_columns_with_unsupported_types_in_schema_inference = false; } orc; /// For capnProto format we should determine how to diff --git a/src/Formats/ReadSchemaUtils.cpp b/src/Formats/ReadSchemaUtils.cpp index 559fac4cfaa..18080b0e896 100644 --- a/src/Formats/ReadSchemaUtils.cpp +++ b/src/Formats/ReadSchemaUtils.cpp @@ -65,8 +65,11 @@ ColumnsDescription readSchemaFromFormat(const String & format_name, const std::o return readSchemaFromFormat(format_name, format_settings, read_buffer_creator, context, buf_out); } -DataTypePtr generalizeDataType(DataTypePtr type) +DataTypePtr makeNullableRecursivelyAndCheckForNothing(DataTypePtr type) { + if (!type) + return nullptr; + WhichDataType which(type); if (which.isNothing()) @@ -75,16 +78,13 @@ DataTypePtr generalizeDataType(DataTypePtr type) if (which.isNullable()) { const auto * nullable_type = assert_cast(type.get()); - return generalizeDataType(nullable_type->getNestedType()); + return 
makeNullableRecursivelyAndCheckForNothing(nullable_type->getNestedType()); } - if (isNumber(type)) - return makeNullable(std::make_shared()); - if (which.isArray()) { const auto * array_type = assert_cast(type.get()); - auto nested_type = generalizeDataType(array_type->getNestedType()); + auto nested_type = makeNullableRecursivelyAndCheckForNothing(array_type->getNestedType()); return nested_type ? std::make_shared(nested_type) : nullptr; } @@ -94,7 +94,7 @@ DataTypePtr generalizeDataType(DataTypePtr type) DataTypes nested_types; for (const auto & element : tuple_type->getElements()) { - auto nested_type = generalizeDataType(element); + auto nested_type = makeNullableRecursivelyAndCheckForNothing(element); if (!nested_type) return nullptr; nested_types.push_back(nested_type); @@ -105,19 +105,27 @@ DataTypePtr generalizeDataType(DataTypePtr type) if (which.isMap()) { const auto * map_type = assert_cast(type.get()); - auto key_type = removeNullable(generalizeDataType(map_type->getKeyType())); - auto value_type = generalizeDataType(map_type->getValueType()); - return key_type && value_type ? std::make_shared(key_type, value_type) : nullptr; + auto key_type = makeNullableRecursivelyAndCheckForNothing(map_type->getKeyType()); + auto value_type = makeNullableRecursivelyAndCheckForNothing(map_type->getValueType()); + return key_type && value_type ? std::make_shared(removeNullable(key_type), value_type) : nullptr; } if (which.isLowCarnality()) { const auto * lc_type = assert_cast(type.get()); - auto nested_type = generalizeDataType(lc_type->getDictionaryType()); + auto nested_type = makeNullableRecursivelyAndCheckForNothing(lc_type->getDictionaryType()); return nested_type ? std::make_shared(nested_type) : nullptr; } return makeNullable(type); } +NamesAndTypesList getNamesAndRecursivelyNullableTypes(const Block & header) +{ + NamesAndTypesList result; + for (auto & [name, type] : header.getNamesAndTypesList()) + result.emplace_back(name, makeNullableRecursivelyAndCheckForNothing(type)); + return result; +} + } diff --git a/src/Formats/ReadSchemaUtils.h b/src/Formats/ReadSchemaUtils.h index 4446393a581..ea8ebbad4c0 100644 --- a/src/Formats/ReadSchemaUtils.h +++ b/src/Formats/ReadSchemaUtils.h @@ -29,14 +29,16 @@ ColumnsDescription readSchemaFromFormat( ContextPtr context, std::unique_ptr & buf_out); -/// Convert type to the most general type: -/// - IntN, UIntN, FloatN, Decimal -> Float64 +/// Make type Nullable recursively: /// - Type -> Nullable(type) /// - Array(Type) -> Array(Nullable(Type)) /// - Tuple(Type1, ..., TypeN) -> Tuple(Nullable(Type1), ..., Nullable(TypeN)) /// - Map(KeyType, ValueType) -> Map(KeyType, Nullable(ValueType)) /// - LowCardinality(Type) -> LowCardinality(Nullable(Type)) /// If type is Nothing or one of the nested types is Nothing, return nullptr. -DataTypePtr generalizeDataType(DataTypePtr type); +DataTypePtr makeNullableRecursivelyAndCheckForNothing(DataTypePtr type); +/// Call makeNullableRecursivelyAndCheckForNothing for all types +/// in the block and return names and types. +NamesAndTypesList getNamesAndRecursivelyNullableTypes(const Block & header); } diff --git a/src/IO/ReadHelpers.cpp b/src/IO/ReadHelpers.cpp index e086f16be54..98a33612aa8 100644 --- a/src/IO/ReadHelpers.cpp +++ b/src/IO/ReadHelpers.cpp @@ -1366,6 +1366,7 @@ void readQuotedFieldIntoString(String & s, ReadBuffer & buf) /// - Tuples: (...) /// - Maps: {...} /// - NULL + /// - Bool: true/false /// - Number: integer, float, decimal. 
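The true/false branch added in this hunk lets readQuotedFieldIntoString pass bool literals through to the Quoted-escaping-rule type inference, so formats that read fields this way (Values, and quoted literals in other text formats) can surface a Bool column instead of falling back to String. A hedged sketch, assuming a hypothetical data.values file containing a row such as (1, 'abc', true, NULL); the file name and values are invented, and the exact numeric type depends on the Values schema reader (also touched by this patch) rather than on this hunk alone:

    DESC file('data.values', 'Values');
    -- the third column can now be recognized as a (Nullable) Bool;
    -- the NULL-only fourth column still needs another row before it gets a concrete type
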
if (*buf.position() == '\'') @@ -1394,6 +1395,16 @@ void readQuotedFieldIntoString(String & s, ReadBuffer & buf) s.append("NaN"); } } + else if (checkCharCaseInsensitive('t', buf)) + { + assertStringCaseInsensitive("rue", buf); + s.append("true"); + } + else if (checkCharCaseInsensitive('f', buf)) + { + assertStringCaseInsensitive("alse", buf); + s.append("false"); + } else { /// It's an integer, float or decimal. They all can be parsed as float. diff --git a/src/Processors/Formats/ISchemaReader.cpp b/src/Processors/Formats/ISchemaReader.cpp index 096e39a2893..17db8865310 100644 --- a/src/Processors/Formats/ISchemaReader.cpp +++ b/src/Processors/Formats/ISchemaReader.cpp @@ -1,6 +1,7 @@ #include #include #include +#include namespace DB { @@ -10,9 +11,16 @@ namespace ErrorCodes extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; } -IRowSchemaReader::IRowSchemaReader(ReadBuffer & in_, size_t max_rows_to_read_, DataTypePtr default_type_) - : ISchemaReader(in_), max_rows_to_read(max_rows_to_read_), default_type(default_type_) +IRowSchemaReader::IRowSchemaReader( + ReadBuffer & in_, const FormatSettings & format_settings, DataTypePtr default_type_, const DataTypes & default_types_) + : ISchemaReader(in_) + , max_rows_to_read(format_settings.max_rows_to_read_for_schema_inference), default_type(default_type_), default_types(default_types_) { + if (!format_settings.column_names_for_schema_inference.empty()) + { + /// column_names_for_schema_inference is a string in format 'column1,column2,column3,...' + boost::split(column_names, format_settings.column_names_for_schema_inference, boost::is_any_of(",")); + } } NamesAndTypesList IRowSchemaReader::readSchema() @@ -43,6 +51,8 @@ NamesAndTypesList IRowSchemaReader::readSchema() { if (default_type) data_types[i] = default_type; + else if (!default_types.empty() && i < default_types.size() && default_types[i]) + data_types[i] = default_types[i]; else throw Exception( ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, @@ -74,14 +84,16 @@ NamesAndTypesList IRowSchemaReader::readSchema() /// Check that we could determine the type of this column. if (!data_types[i]) { - if (!default_type) + if (default_type) + data_types[i] = default_type; + else if (!default_types.empty() && i < default_types.size() && default_types[i]) + data_types[i] = default_types[i]; + else throw Exception( ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot determine table structure by first {} rows of data, because some columns contain only Nulls. To increase the maximum " "number of rows to read for structure determination, use setting input_format_max_rows_to_read_for_schema_inference", max_rows_to_read); - - data_types[i] = default_type; } result.emplace_back(column_names[i], data_types[i]); } diff --git a/src/Processors/Formats/ISchemaReader.h b/src/Processors/Formats/ISchemaReader.h index 2d35809e26a..cd0b552c021 100644 --- a/src/Processors/Formats/ISchemaReader.h +++ b/src/Processors/Formats/ISchemaReader.h @@ -27,12 +27,14 @@ protected: /// Base class for schema inference for formats that read data row by row. /// It reads data row by row (up to max_rows_to_read), determines types of columns /// for each row and compare them with types from the previous rows. If some column -/// contains values with different types in different rows, the default type will be -/// used for this column or the exception will be thrown (if default type is not set). 
+/// contains values with different types in different rows, the default type +/// (from argument default_type_) will be used for this column or the exception +/// will be thrown (if default type is not set). If different columns have different +/// default types, you can provide them by default_types_ argument. class IRowSchemaReader : public ISchemaReader { public: - IRowSchemaReader(ReadBuffer & in_, size_t max_rows_to_read_, DataTypePtr default_type_ = nullptr); + IRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings, DataTypePtr default_type_ = nullptr, const DataTypes & default_types_ = {}); NamesAndTypesList readSchema() override; protected: @@ -47,6 +49,7 @@ protected: private: size_t max_rows_to_read; DataTypePtr default_type; + DataTypes default_types; std::vector column_names; }; diff --git a/src/Processors/Formats/Impl/ArrowBlockInputFormat.cpp b/src/Processors/Formats/Impl/ArrowBlockInputFormat.cpp index cf5cfa681a1..42c68e4654b 100644 --- a/src/Processors/Formats/Impl/ArrowBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ArrowBlockInputFormat.cpp @@ -3,6 +3,7 @@ #if USE_ARROW #include +#include #include #include #include @@ -167,8 +168,9 @@ NamesAndTypesList ArrowSchemaReader::readSchema() schema = createFileReader(in, format_settings, is_stopped)->schema(); } - auto header = ArrowColumnToCHColumn::arrowSchemaToCHHeader(*schema, stream ? "ArrowStream" : "Arrow"); - return header.getNamesAndTypesList(); + auto header = ArrowColumnToCHColumn::arrowSchemaToCHHeader( + *schema, stream ? "ArrowStream" : "Arrow", format_settings.arrow.skip_columns_with_unsupported_types_in_schema_inference); + return getNamesAndRecursivelyNullableTypes(header); } void registerInputFormatArrow(FormatFactory & factory) @@ -198,13 +200,13 @@ void registerArrowSchemaReader(FormatFactory & factory) { factory.registerSchemaReader( "Arrow", - [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + [](ReadBuffer & buf, const FormatSettings & settings) { return std::make_shared(buf, false, settings); }); factory.registerSchemaReader( "ArrowStream", - [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + [](ReadBuffer & buf, const FormatSettings & settings) { return std::make_shared(buf, true, settings); });} diff --git a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp index 14c81a0d90d..8f5bb205bef 100644 --- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp +++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -26,11 +27,13 @@ #include #include #include +#include #include #include #include #include #include +#include /// UINT16 and UINT32 are processed separately, see comments in readColumnFromArrowColumn. 
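For the Arrow-family formats, the schema reader changes above have two user-visible effects: inferred column types are wrapped into Nullable recursively, and a column whose Arrow type cannot be mapped can be skipped instead of aborting inference, controlled by the new input_format_{arrow,parquet,orc}_skip_columns_with_unsupported_types_in_schema_inference settings. A sketch under the assumption of a hypothetical data.parquet file with one supported column and one column of an unsupported type (file name and layout are illustrative):

    DESC file('data.parquet', 'Parquet');
    -- fails with UNKNOWN_TYPE; the error text now points at the setting below

    SET input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference = 1;
    DESC file('data.parquet', 'Parquet');
    -- the unsupported column is dropped from the inferred schema,
    -- and the remaining columns come back as Nullable(...) types
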
#define FOR_ARROW_NUMERIC_TYPES(M) \ @@ -328,12 +331,17 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( const std::string & format_name, bool is_nullable, std::unordered_map> & dictionary_values, - bool read_ints_as_dates) + bool read_ints_as_dates, + bool allow_null_type, + bool skip_columns_with_unsupported_types, + bool & skipped) { if (!is_nullable && arrow_column->null_count() && arrow_column->type()->id() != arrow::Type::LIST && arrow_column->type()->id() != arrow::Type::MAP && arrow_column->type()->id() != arrow::Type::STRUCT) { - auto nested_column = readColumnFromArrowColumn(arrow_column, column_name, format_name, true, dictionary_values, read_ints_as_dates); + auto nested_column = readColumnFromArrowColumn(arrow_column, column_name, format_name, true, dictionary_values, read_ints_as_dates, allow_null_type, skip_columns_with_unsupported_types, skipped); + if (skipped) + return {}; auto nullmap_column = readByteMapFromArrowColumn(arrow_column); auto nullable_type = std::make_shared(std::move(nested_column.type)); auto nullable_column = ColumnNullable::create(nested_column.column, nullmap_column); @@ -378,7 +386,10 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( case arrow::Type::MAP: { auto arrow_nested_column = getNestedArrowColumn(arrow_column); - auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_values, read_ints_as_dates); + auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_values, read_ints_as_dates, allow_null_type, skip_columns_with_unsupported_types, skipped); + if (skipped) + return {}; + auto offsets_column = readOffsetsFromArrowListColumn(arrow_column); const auto * tuple_column = assert_cast(nested_column.column.get()); @@ -390,7 +401,9 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( case arrow::Type::LIST: { auto arrow_nested_column = getNestedArrowColumn(arrow_column); - auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_values, read_ints_as_dates); + auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_values, read_ints_as_dates, allow_null_type, skip_columns_with_unsupported_types, skipped); + if (skipped) + return {}; auto offsets_column = readOffsetsFromArrowListColumn(arrow_column); auto array_column = ColumnArray::create(nested_column.column, offsets_column); auto array_type = std::make_shared(nested_column.type); @@ -415,7 +428,9 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( for (int i = 0; i != arrow_struct_type->num_fields(); ++i) { auto nested_arrow_column = std::make_shared(nested_arrow_columns[i]); - auto element = readColumnFromArrowColumn(nested_arrow_column, arrow_struct_type->field(i)->name(), format_name, false, dictionary_values, read_ints_as_dates); + auto element = readColumnFromArrowColumn(nested_arrow_column, arrow_struct_type->field(i)->name(), format_name, false, dictionary_values, read_ints_as_dates, allow_null_type, skip_columns_with_unsupported_types, skipped); + if (skipped) + return {}; tuple_elements.emplace_back(std::move(element.column)); tuple_types.emplace_back(std::move(element.type)); tuple_names.emplace_back(std::move(element.name)); @@ -438,7 +453,7 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( dict_array.emplace_back(dict_chunk.dictionary()); } auto arrow_dict_column = std::make_shared(dict_array); - auto dict_column = 
readColumnFromArrowColumn(arrow_dict_column, column_name, format_name, false, dictionary_values, read_ints_as_dates); + auto dict_column = readColumnFromArrowColumn(arrow_dict_column, column_name, format_name, false, dictionary_values, read_ints_as_dates, allow_null_type, skip_columns_with_unsupported_types, skipped); /// We should convert read column to ColumnUnique. auto tmp_lc_column = DataTypeLowCardinality(dict_column.type).createColumn(); @@ -468,9 +483,33 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( # undef DISPATCH // TODO: read JSON as a string? // TODO: read UUID as a string? + case arrow::Type::NA: + { + if (allow_null_type) + { + auto type = std::make_shared(); + auto column = ColumnNothing::create(arrow_column->length()); + return {std::move(column), type, column_name}; + } + [[fallthrough]]; + } default: - throw Exception(ErrorCodes::UNKNOWN_TYPE, - "Unsupported {} type '{}' of an input column '{}'.", format_name, arrow_column->type()->name(), column_name); + { + if (skip_columns_with_unsupported_types) + { + skipped = true; + return {}; + } + + throw Exception( + ErrorCodes::UNKNOWN_TYPE, + "Unsupported {} type '{}' of an input column '{}'. If it happens during schema inference and you want to skip columns with " + "unsupported types, you can enable setting input_format_{}_skip_columns_with_unsupported_types_in_schema_inference", + format_name, + arrow_column->type()->name(), + column_name, + boost::algorithm::to_lower_copy(format_name)); + } } } @@ -484,7 +523,8 @@ static void checkStatus(const arrow::Status & status, const String & column_name throw Exception{ErrorCodes::UNKNOWN_EXCEPTION, "Error with a {} column '{}': {}.", format_name, column_name, status.ToString()}; } -Block ArrowColumnToCHColumn::arrowSchemaToCHHeader(const arrow::Schema & schema, const std::string & format_name, const Block * hint_header) + +Block ArrowColumnToCHColumn::arrowSchemaToCHHeader(const arrow::Schema & schema, const std::string & format_name, bool skip_columns_with_unsupported_types, const Block * hint_header) { ColumnsWithTypeAndName sample_columns; std::unordered_set nested_table_names; @@ -508,9 +548,10 @@ Block ArrowColumnToCHColumn::arrowSchemaToCHHeader(const arrow::Schema & schema, arrow::ArrayVector array_vector = {arrow_array}; auto arrow_column = std::make_shared(array_vector); std::unordered_map> dict_values; - ColumnWithTypeAndName sample_column = readColumnFromArrowColumn(arrow_column, field->name(), format_name, false, dict_values, false); - - sample_columns.emplace_back(std::move(sample_column)); + bool skipped = false; + ColumnWithTypeAndName sample_column = readColumnFromArrowColumn(arrow_column, field->name(), format_name, false, dict_values, false, false, skip_columns_with_unsupported_types, skipped); + if (!skipped) + sample_columns.emplace_back(std::move(sample_column)); } return Block(std::move(sample_columns)); } @@ -544,6 +585,7 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr & UInt64 num_rows = name_to_column_ptr.begin()->second->length(); columns_list.reserve(header.rows()); std::unordered_map nested_tables; + bool skipped = false; for (size_t column_i = 0, columns = header.columns(); column_i < columns; ++column_i) { const ColumnWithTypeAndName & header_column = header.getByPosition(column_i); @@ -558,7 +600,7 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr & if (!nested_tables.contains(nested_table_name)) { std::shared_ptr arrow_column = name_to_column_ptr[nested_table_name]; - 
ColumnsWithTypeAndName cols = {readColumnFromArrowColumn(arrow_column, nested_table_name, format_name, false, dictionary_values, true)}; + ColumnsWithTypeAndName cols = {readColumnFromArrowColumn(arrow_column, nested_table_name, format_name, false, dictionary_values, true, true, false, skipped)}; Block block(cols); nested_tables[nested_table_name] = std::make_shared(Nested::flatten(block)); } @@ -586,7 +628,7 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr & if (read_from_nested) column = nested_tables[nested_table_name]->getByName(header_column.name); else - column = readColumnFromArrowColumn(arrow_column, header_column.name, format_name, false, dictionary_values, true); + column = readColumnFromArrowColumn(arrow_column, header_column.name, format_name, false, dictionary_values, true, true, false, skipped); try { diff --git a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h index cf4f6bb3ff3..cc5852691e0 100644 --- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h +++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h @@ -36,7 +36,11 @@ public: /// Transform arrow schema to ClickHouse header. If hint_header is provided, /// we will skip columns in schema that are not in hint_header. - static Block arrowSchemaToCHHeader(const arrow::Schema & schema, const std::string & format_name, const Block * hint_header = nullptr); + static Block arrowSchemaToCHHeader( + const arrow::Schema & schema, + const std::string & format_name, + bool skip_columns_with_unsupported_types = false, + const Block * hint_header = nullptr); private: const Block & header; diff --git a/src/Processors/Formats/Impl/AvroRowInputFormat.cpp b/src/Processors/Formats/Impl/AvroRowInputFormat.cpp index a372df41344..29429650c19 100644 --- a/src/Processors/Formats/Impl/AvroRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/AvroRowInputFormat.cpp @@ -924,12 +924,12 @@ void registerInputFormatAvro(FormatFactory & factory) void registerAvroSchemaReader(FormatFactory & factory) { - factory.registerSchemaReader("Avro", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + factory.registerSchemaReader("Avro", [](ReadBuffer & buf, const FormatSettings & settings) { return std::make_shared(buf, false, settings); }); - factory.registerSchemaReader("AvroConfluent", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + factory.registerSchemaReader("AvroConfluent", [](ReadBuffer & buf, const FormatSettings & settings) { return std::make_shared(buf, true, settings); }); diff --git a/src/Processors/Formats/Impl/BinaryRowInputFormat.cpp b/src/Processors/Formats/Impl/BinaryRowInputFormat.cpp index 6918220feb4..d3de2fbf494 100644 --- a/src/Processors/Formats/Impl/BinaryRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/BinaryRowInputFormat.cpp @@ -95,7 +95,7 @@ void BinaryFormatReader::skipField(size_t file_column) } BinaryWithNamesAndTypesSchemaReader::BinaryWithNamesAndTypesSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_) - : FormatWithNamesAndTypesSchemaReader(in_, 0, true, true, &reader), reader(in_, format_settings_) + : FormatWithNamesAndTypesSchemaReader(in_, format_settings_, true, true, &reader), reader(in_, format_settings_) { } @@ -119,7 +119,7 @@ void registerInputFormatRowBinary(FormatFactory & factory) void registerRowBinaryWithNamesAndTypesSchemaReader(FormatFactory & factory) { - factory.registerSchemaReader("RowBinaryWithNamesAndTypes", [](ReadBuffer & buf, const FormatSettings & 
settings, ContextPtr) + factory.registerSchemaReader("RowBinaryWithNamesAndTypes", [](ReadBuffer & buf, const FormatSettings & settings) { return std::make_shared(buf, settings); }); diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp index 216ec6b295a..f246d5c0a35 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp @@ -9,7 +9,6 @@ #include #include #include -#include #include @@ -259,16 +258,15 @@ bool CSVFormatReader::readField( } -CSVSchemaReader::CSVSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, const FormatSettings & format_setting_, ContextPtr context_) +CSVSchemaReader::CSVSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, const FormatSettings & format_setting_) : FormatWithNamesAndTypesSchemaReader( in_, - format_setting_.max_rows_to_read_for_schema_inference, + format_setting_, with_names_, with_types_, &reader, getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule::CSV)) , reader(in_, format_setting_) - , context(context_) { } @@ -279,7 +277,7 @@ DataTypes CSVSchemaReader::readRowAndGetDataTypes() return {}; auto fields = reader.readRow(); - return determineDataTypesByEscapingRule(fields, reader.getFormatSettings(), FormatSettings::EscapingRule::CSV, context); + return determineDataTypesByEscapingRule(fields, reader.getFormatSettings(), FormatSettings::EscapingRule::CSV); } @@ -382,9 +380,9 @@ void registerCSVSchemaReader(FormatFactory & factory) { auto register_func = [&](const String & format_name, bool with_names, bool with_types) { - factory.registerSchemaReader(format_name, [with_names, with_types](ReadBuffer & buf, const FormatSettings & settings, ContextPtr context) + factory.registerSchemaReader(format_name, [with_names, with_types](ReadBuffer & buf, const FormatSettings & settings) { - return std::make_shared(buf, with_names, with_types, settings, context); + return std::make_shared(buf, with_names, with_types, settings); }); }; diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.h b/src/Processors/Formats/Impl/CSVRowInputFormat.h index ad9f6c4e492..ee45264d573 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.h +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.h @@ -74,13 +74,12 @@ public: class CSVSchemaReader : public FormatWithNamesAndTypesSchemaReader { public: - CSVSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, const FormatSettings & format_setting_, ContextPtr context_); + CSVSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, const FormatSettings & format_setting_); private: DataTypes readRowAndGetDataTypes() override; CSVFormatReader reader; - ContextPtr context; }; } diff --git a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp index d2e0d6e21a9..74c5fb1945a 100644 --- a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp @@ -289,17 +289,16 @@ void CustomSeparatedFormatReader::setReadBuffer(ReadBuffer & in_) } CustomSeparatedSchemaReader::CustomSeparatedSchemaReader( - ReadBuffer & in_, bool with_names_, bool with_types_, bool ignore_spaces_, const FormatSettings & format_setting_, ContextPtr context_) + ReadBuffer & in_, bool with_names_, bool with_types_, bool ignore_spaces_, const FormatSettings & format_setting_) : FormatWithNamesAndTypesSchemaReader( buf, - 
format_setting_.max_rows_to_read_for_schema_inference, + format_setting_, with_names_, with_types_, &reader, getDefaultDataTypeForEscapingRule(format_setting_.custom.escaping_rule)) , buf(in_) , reader(buf, ignore_spaces_, updateFormatSettings(format_setting_)) - , context(context_) { } @@ -315,7 +314,7 @@ DataTypes CustomSeparatedSchemaReader::readRowAndGetDataTypes() first_row = false; auto fields = reader.readRow(); - return determineDataTypesByEscapingRule(fields, reader.getFormatSettings(), reader.getEscapingRule(), context); + return determineDataTypesByEscapingRule(fields, reader.getFormatSettings(), reader.getEscapingRule()); } void registerInputFormatCustomSeparated(FormatFactory & factory) @@ -343,9 +342,9 @@ void registerCustomSeparatedSchemaReader(FormatFactory & factory) { auto register_func = [&](const String & format_name, bool with_names, bool with_types) { - factory.registerSchemaReader(format_name, [with_names, with_types, ignore_spaces](ReadBuffer & buf, const FormatSettings & settings, ContextPtr context) + factory.registerSchemaReader(format_name, [with_names, with_types, ignore_spaces](ReadBuffer & buf, const FormatSettings & settings) { - return std::make_shared(buf, with_names, with_types, ignore_spaces, settings, context); + return std::make_shared(buf, with_names, with_types, ignore_spaces, settings); }); }; diff --git a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.h b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.h index a2f4509d307..d9e62a1b8e9 100644 --- a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.h +++ b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.h @@ -92,14 +92,13 @@ private: class CustomSeparatedSchemaReader : public FormatWithNamesAndTypesSchemaReader { public: - CustomSeparatedSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, bool ignore_spaces_, const FormatSettings & format_setting_, ContextPtr context_); + CustomSeparatedSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, bool ignore_spaces_, const FormatSettings & format_setting_); private: DataTypes readRowAndGetDataTypes() override; PeekableReadBuffer buf; CustomSeparatedFormatReader reader; - ContextPtr context; bool first_row = true; }; diff --git a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp index c087749d8d8..15b31eec11a 100644 --- a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp @@ -182,7 +182,7 @@ bool JSONCompactEachRowFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer & } JSONCompactEachRowRowSchemaReader::JSONCompactEachRowRowSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, bool yield_strings_, const FormatSettings & format_settings_) - : FormatWithNamesAndTypesSchemaReader(in_, format_settings_.max_rows_to_read_for_schema_inference, with_names_, with_types_, &reader), reader(in_, yield_strings_, format_settings_) + : FormatWithNamesAndTypesSchemaReader(in_, format_settings_, with_names_, with_types_, &reader), reader(in_, yield_strings_, format_settings_) { } @@ -231,7 +231,7 @@ void registerJSONCompactEachRowSchemaReader(FormatFactory & factory) { auto register_func = [&](const String & format_name, bool with_names, bool with_types) { - factory.registerSchemaReader(format_name, [=](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + factory.registerSchemaReader(format_name, [=](ReadBuffer & 
buf, const FormatSettings & settings) { return std::make_shared(buf, with_names, with_types, json_strings, settings); }); diff --git a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp index 549fd7a6113..19a97f50984 100644 --- a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp @@ -382,12 +382,12 @@ void registerNonTrivialPrefixAndSuffixCheckerJSONEachRow(FormatFactory & factory void registerJSONEachRowSchemaReader(FormatFactory & factory) { - factory.registerSchemaReader("JSONEachRow", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + factory.registerSchemaReader("JSONEachRow", [](ReadBuffer & buf, const FormatSettings & settings) { return std::make_unique(buf, false, settings); }); - factory.registerSchemaReader("JSONStringsEachRow", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + factory.registerSchemaReader("JSONStringsEachRow", [](ReadBuffer & buf, const FormatSettings & settings) { return std::make_unique(buf, true, settings); }); diff --git a/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp b/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp index 607e6f36767..36e7a56ebea 100644 --- a/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp @@ -414,7 +414,7 @@ void MsgPackRowInputFormat::setReadBuffer(ReadBuffer & in_) } MsgPackSchemaReader::MsgPackSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_) - : IRowSchemaReader(buf, format_settings_.max_rows_to_read_for_schema_inference), buf(in_), number_of_columns(format_settings_.msgpack.number_of_columns) + : IRowSchemaReader(buf, format_settings_), buf(in_), number_of_columns(format_settings_.msgpack.number_of_columns) { if (!number_of_columns) throw Exception(ErrorCodes::BAD_ARGUMENTS, "You must specify setting input_format_msgpack_number_of_columns to extract table schema from MsgPack data"); @@ -535,7 +535,7 @@ void registerInputFormatMsgPack(FormatFactory & factory) void registerMsgPackSchemaReader(FormatFactory & factory) { - factory.registerSchemaReader("MsgPack", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + factory.registerSchemaReader("MsgPack", [](ReadBuffer & buf, const FormatSettings & settings) { return std::make_shared(buf, settings); }); diff --git a/src/Processors/Formats/Impl/NativeFormat.cpp b/src/Processors/Formats/Impl/NativeFormat.cpp index bd95cfd6376..c1dc60022f5 100644 --- a/src/Processors/Formats/Impl/NativeFormat.cpp +++ b/src/Processors/Formats/Impl/NativeFormat.cpp @@ -133,7 +133,7 @@ void registerOutputFormatNative(FormatFactory & factory) void registerNativeSchemaReader(FormatFactory & factory) { - factory.registerSchemaReader("Native", [](ReadBuffer & buf, const FormatSettings &, ContextPtr) + factory.registerSchemaReader("Native", [](ReadBuffer & buf, const FormatSettings &) { return std::make_shared(buf); }); diff --git a/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp b/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp index aa9f7874ae8..e93897edfbe 100644 --- a/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp @@ -3,6 +3,7 @@ #if USE_ORC #include +#include #include #include #include @@ -187,8 +188,9 @@ NamesAndTypesList ORCSchemaReader::readSchema() std::shared_ptr schema; std::atomic is_stopped = 0; getFileReaderAndSchema(in, file_reader, schema, 
format_settings, is_stopped); - auto header = ArrowColumnToCHColumn::arrowSchemaToCHHeader(*schema, "ORC"); - return header.getNamesAndTypesList(); + auto header = ArrowColumnToCHColumn::arrowSchemaToCHHeader( + *schema, "ORC", format_settings.orc.skip_columns_with_unsupported_types_in_schema_inference); + return getNamesAndRecursivelyNullableTypes(header); } void registerInputFormatORC(FormatFactory & factory) @@ -209,7 +211,7 @@ void registerORCSchemaReader(FormatFactory & factory) { factory.registerSchemaReader( "ORC", - [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + [](ReadBuffer & buf, const FormatSettings & settings) { return std::make_shared(buf, settings); } diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp index 548bf0138f5..f3d81822297 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp @@ -4,6 +4,7 @@ #if USE_PARQUET #include +#include #include #include #include @@ -186,8 +187,9 @@ NamesAndTypesList ParquetSchemaReader::readSchema() std::shared_ptr schema; std::atomic is_stopped = 0; getFileReaderAndSchema(in, file_reader, schema, format_settings, is_stopped); - auto header = ArrowColumnToCHColumn::arrowSchemaToCHHeader(*schema, "Parquet"); - return header.getNamesAndTypesList(); + auto header = ArrowColumnToCHColumn::arrowSchemaToCHHeader( + *schema, "Parquet", format_settings.parquet.skip_columns_with_unsupported_types_in_schema_inference); + return getNamesAndRecursivelyNullableTypes(header); } void registerInputFormatParquet(FormatFactory & factory) @@ -208,7 +210,7 @@ void registerParquetSchemaReader(FormatFactory & factory) { factory.registerSchemaReader( "Parquet", - [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + [](ReadBuffer & buf, const FormatSettings & settings) { return std::make_shared(buf, settings); } diff --git a/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp b/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp index 4754b70d375..f18b6b0aaab 100644 --- a/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp @@ -128,15 +128,14 @@ void RegexpRowInputFormat::setReadBuffer(ReadBuffer & in_) IInputFormat::setReadBuffer(*buf); } -RegexpSchemaReader::RegexpSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_, ContextPtr context_) +RegexpSchemaReader::RegexpSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_) : IRowSchemaReader( buf, - format_settings_.max_rows_to_read_for_schema_inference, + format_settings_, getDefaultDataTypeForEscapingRule(format_settings_.regexp.escaping_rule)) , format_settings(format_settings_) , field_extractor(format_settings) , buf(in_) - , context(context_) { } @@ -152,7 +151,7 @@ DataTypes RegexpSchemaReader::readRowAndGetDataTypes() for (size_t i = 0; i != field_extractor.getMatchedFieldsSize(); ++i) { String field(field_extractor.getField(i)); - data_types.push_back(determineDataTypeByEscapingRule(field, format_settings, format_settings.regexp.escaping_rule, context)); + data_types.push_back(determineDataTypeByEscapingRule(field, format_settings, format_settings.regexp.escaping_rule)); } return data_types; @@ -203,9 +202,9 @@ void registerFileSegmentationEngineRegexp(FormatFactory & factory) void registerRegexpSchemaReader(FormatFactory & factory) { - factory.registerSchemaReader("Regexp", [](ReadBuffer & buf, const FormatSettings & settings, 
ContextPtr context) + factory.registerSchemaReader("Regexp", [](ReadBuffer & buf, const FormatSettings & settings) { - return std::make_shared(buf, settings, context); + return std::make_shared(buf, settings); }); } diff --git a/src/Processors/Formats/Impl/RegexpRowInputFormat.h b/src/Processors/Formats/Impl/RegexpRowInputFormat.h index 04f24bbb3e4..3cc6a3192fd 100644 --- a/src/Processors/Formats/Impl/RegexpRowInputFormat.h +++ b/src/Processors/Formats/Impl/RegexpRowInputFormat.h @@ -76,7 +76,7 @@ private: class RegexpSchemaReader : public IRowSchemaReader { public: - RegexpSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings, ContextPtr context_); + RegexpSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings); private: DataTypes readRowAndGetDataTypes() override; @@ -85,7 +85,6 @@ private: const FormatSettings format_settings; RegexpFieldExtractor field_extractor; PeekableReadBuffer buf; - ContextPtr context; }; } diff --git a/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp b/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp index f63d6fa9c46..4c50e4d9b03 100644 --- a/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp @@ -280,7 +280,7 @@ void registerInputFormatTSKV(FormatFactory & factory) } void registerTSKVSchemaReader(FormatFactory & factory) { - factory.registerSchemaReader("TSKV", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + factory.registerSchemaReader("TSKV", [](ReadBuffer & buf, const FormatSettings & settings) { return std::make_shared(buf, settings); }); diff --git a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp index bb844ec68ea..b6c9438a57c 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp @@ -235,7 +235,7 @@ TabSeparatedSchemaReader::TabSeparatedSchemaReader( ReadBuffer & in_, bool with_names_, bool with_types_, bool is_raw_, const FormatSettings & format_settings_) : FormatWithNamesAndTypesSchemaReader( in_, - format_settings_.max_rows_to_read_for_schema_inference, + format_settings_, with_names_, with_types_, &reader, @@ -280,7 +280,7 @@ void registerTSVSchemaReader(FormatFactory & factory) { auto register_func = [&](const String & format_name, bool with_names, bool with_types) { - factory.registerSchemaReader(format_name, [with_names, with_types, is_raw](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + factory.registerSchemaReader(format_name, [with_names, with_types, is_raw](ReadBuffer & buf, const FormatSettings & settings) { return std::make_shared(buf, with_names, with_types, is_raw, settings); }); diff --git a/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp b/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp index 06d6ba06bcc..a1f70730b39 100644 --- a/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp @@ -453,14 +453,12 @@ TemplateSchemaReader::TemplateSchemaReader( const ParsedTemplateFormatString & format_, const ParsedTemplateFormatString & row_format_, std::string row_between_delimiter, - const FormatSettings & format_settings_, - ContextPtr context_) - : IRowSchemaReader(buf, format_settings_.max_rows_to_read_for_schema_inference) + const FormatSettings & format_settings_) + : IRowSchemaReader(buf, format_settings_, nullptr, getDefaultDataTypeForEscapingRules(row_format_.escaping_rules)) , buf(in_) 
, format(format_) , row_format(row_format_) , format_settings(format_settings_) - , context(context_) , format_reader(buf, ignore_spaces_, format, row_format, row_between_delimiter, format_settings) { setColumnNames(row_format.column_names); @@ -489,7 +487,7 @@ DataTypes TemplateSchemaReader::readRowAndGetDataTypes() format_settings.csv.delimiter = row_format.delimiters[i + 1].empty() ? format_settings.csv.delimiter : row_format.delimiters[i + 1].front(); field = readFieldByEscapingRule(buf, row_format.escaping_rules[i], format_settings); - data_types.push_back(determineDataTypeByEscapingRule(field, format_settings, row_format.escaping_rules[i], context)); + data_types.push_back(determineDataTypeByEscapingRule(field, format_settings, row_format.escaping_rules[i])); } format_reader.skipRowEndDelimiter(); @@ -564,12 +562,12 @@ void registerTemplateSchemaReader(FormatFactory & factory) { for (bool ignore_spaces : {false, true}) { - factory.registerSchemaReader(ignore_spaces ? "TemplateIgnoreSpaces" : "Template", [ignore_spaces](ReadBuffer & buf, const FormatSettings & settings, ContextPtr context) + factory.registerSchemaReader(ignore_spaces ? "TemplateIgnoreSpaces" : "Template", [ignore_spaces](ReadBuffer & buf, const FormatSettings & settings) { size_t index = 0; auto idx_getter = [&](const String &) -> std::optional { return index++; }; auto row_format = fillRowFormat(settings, idx_getter, false); - return std::make_shared(buf, ignore_spaces, fillResultSetFormat(settings), row_format, settings.template_settings.row_between_delimiter, settings, context); + return std::make_shared(buf, ignore_spaces, fillResultSetFormat(settings), row_format, settings.template_settings.row_between_delimiter, settings); }); } } diff --git a/src/Processors/Formats/Impl/TemplateRowInputFormat.h b/src/Processors/Formats/Impl/TemplateRowInputFormat.h index b5ced707ace..ab7043f057e 100644 --- a/src/Processors/Formats/Impl/TemplateRowInputFormat.h +++ b/src/Processors/Formats/Impl/TemplateRowInputFormat.h @@ -116,8 +116,7 @@ public: const ParsedTemplateFormatString & format_, const ParsedTemplateFormatString & row_format_, std::string row_between_delimiter, - const FormatSettings & format_settings_, - ContextPtr context_); + const FormatSettings & format_settings_); DataTypes readRowAndGetDataTypes() override; @@ -126,7 +125,6 @@ private: const ParsedTemplateFormatString format; const ParsedTemplateFormatString row_format; FormatSettings format_settings; - ContextPtr context; TemplateFormatReader format_reader; bool first_row = true; }; diff --git a/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp b/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp index bf8feb077ed..e8b4c69bd19 100644 --- a/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -571,8 +572,8 @@ void ValuesBlockInputFormat::setReadBuffer(ReadBuffer & in_) IInputFormat::setReadBuffer(*buf); } -ValuesSchemaReader::ValuesSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_, ContextPtr context_) - : IRowSchemaReader(buf, format_settings_.max_rows_to_read_for_schema_inference), buf(in_), context(context_) +ValuesSchemaReader::ValuesSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_) + : IRowSchemaReader(buf, format_settings_), buf(in_), format_settings(format_settings_) { } @@ -589,38 +590,25 @@ DataTypes ValuesSchemaReader::readRowAndGetDataTypes() return {}; 
assertChar('(', buf); - PeekableReadBufferCheckpoint checkpoint(buf); - skipToNextRow(&buf, 0, 1); - buf.makeContinuousMemoryFromCheckpointToPos(); - buf.rollbackToCheckpoint(); - - Tokens tokens(buf.position(), buf.buffer().end()); - IParser::Pos token_iterator(tokens, context->getSettingsRef().max_parser_depth); - + skipWhitespaceIfAny(buf); DataTypes data_types; - bool finish = false; - while (!finish) + String value; + while (!buf.eof() && *buf.position() != ')') { - Expected expected; - ASTPtr ast; + if (!data_types.empty()) + { + skipWhitespaceIfAny(buf); + assertChar(',', buf); + skipWhitespaceIfAny(buf); + } - bool parsed = parser.parse(token_iterator, ast, expected); - /// Consider delimiter after value (',' or ')') as part of expression - parsed &= token_iterator->type == TokenType::Comma || token_iterator->type == TokenType::ClosingRoundBracket; - - if (!parsed) - throw Exception(ErrorCodes::SYNTAX_ERROR, "Cannot parse expression here: {}, token: {}", - String(buf.position(), std::min(SHOW_CHARS_ON_SYNTAX_ERROR, buf.buffer().end() - buf.position())), String(token_iterator.get().begin, token_iterator.get().end)); - - std::pair result = evaluateConstantExpression(ast, context); - data_types.push_back(generalizeDataType(result.second)); - - if (token_iterator->type == TokenType::ClosingRoundBracket) - finish = true; - ++token_iterator; - buf.position() = const_cast(token_iterator->begin); + readQuotedFieldIntoString(value, buf); + auto type = determineDataTypeByEscapingRule(value, format_settings, FormatSettings::EscapingRule::Quoted); + data_types.push_back(std::move(type)); } + assertChar(')', buf); + skipWhitespaceIfAny(buf); if (!buf.eof() && *buf.position() == ',') ++buf.position(); @@ -642,9 +630,9 @@ void registerInputFormatValues(FormatFactory & factory) void registerValuesSchemaReader(FormatFactory & factory) { - factory.registerSchemaReader("Values", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr context) + factory.registerSchemaReader("Values", [](ReadBuffer & buf, const FormatSettings & settings) { - return std::make_shared(buf, settings, context); + return std::make_shared(buf, settings); }); } diff --git a/src/Processors/Formats/Impl/ValuesBlockInputFormat.h b/src/Processors/Formats/Impl/ValuesBlockInputFormat.h index e1521955472..77967181566 100644 --- a/src/Processors/Formats/Impl/ValuesBlockInputFormat.h +++ b/src/Processors/Formats/Impl/ValuesBlockInputFormat.h @@ -97,13 +97,13 @@ private: class ValuesSchemaReader : public IRowSchemaReader { public: - ValuesSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings, ContextPtr context_); + ValuesSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings); private: DataTypes readRowAndGetDataTypes() override; PeekableReadBuffer buf; - ContextPtr context; + const FormatSettings format_settings; ParserExpression parser; bool first_row = true; }; diff --git a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp index 7720b01dc74..8ab8b55c9c2 100644 --- a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp +++ b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp @@ -295,12 +295,12 @@ void RowInputFormatWithNamesAndTypes::setReadBuffer(ReadBuffer & in_) FormatWithNamesAndTypesSchemaReader::FormatWithNamesAndTypesSchemaReader( ReadBuffer & in_, - size_t max_rows_to_read_, + const FormatSettings & format_settings, bool with_names_, bool with_types_, FormatWithNamesAndTypesReader * format_reader_, 
DataTypePtr default_type_) - : IRowSchemaReader(in_, max_rows_to_read_, default_type_), with_names(with_names_), with_types(with_types_), format_reader(format_reader_) + : IRowSchemaReader(in_, format_settings, default_type_), with_names(with_names_), with_types(with_types_), format_reader(format_reader_) { } diff --git a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h index 25ffc8d6de2..8d24d23186b 100644 --- a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h +++ b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h @@ -124,7 +124,7 @@ class FormatWithNamesAndTypesSchemaReader : public IRowSchemaReader public: FormatWithNamesAndTypesSchemaReader( ReadBuffer & in, - size_t max_rows_to_read_, + const FormatSettings & format_settings, bool with_names_, bool with_types_, FormatWithNamesAndTypesReader * format_reader_, diff --git a/tests/queries/0_stateless/02149_schema_inference.reference b/tests/queries/0_stateless/02149_schema_inference.reference index f46e3bee101..8fca786df05 100644 --- a/tests/queries/0_stateless/02149_schema_inference.reference +++ b/tests/queries/0_stateless/02149_schema_inference.reference @@ -1,17 +1,17 @@ TSV -c1 Nullable(String) +c1 Nullable(Float64) c2 Nullable(String) -c3 Nullable(String) -c4 Nullable(String) -42 Some string [1, 2, 3, 4] (1, 2, 3) -42 abcd [] (4, 5, 6) +c3 Array(Nullable(Float64)) +c4 Tuple(Nullable(Float64), Nullable(Float64), Nullable(Float64)) +42 Some string [1,2,3,4] (1,2,3) +42 abcd [] (4,5,6) TSVWithNames -number Nullable(String) +number Nullable(Float64) string Nullable(String) -array Nullable(String) -tuple Nullable(String) -42 Some string [1, 2, 3, 4] (1, 2, 3) -42 abcd [] (4, 5, 6) +array Array(Nullable(Float64)) +tuple Tuple(Nullable(Float64), Nullable(Float64), Nullable(Float64)) +42 Some string [1,2,3,4] (1,2,3) +42 abcd [] (4,5,6) CSV c1 Nullable(Float64) c2 Nullable(String) @@ -74,12 +74,12 @@ s1 [] 1 \N [3] \N TSKV b Nullable(String) -c Nullable(String) -a Nullable(String) -s1 \N 1 +c Array(Nullable(Float64)) +a Nullable(Float64) +s1 [] 1 } [2] 2 -\N \N \N -\N \N \N +\N [] \N +\N [] \N \N [3] \N Values c1 Nullable(Float64) @@ -96,7 +96,7 @@ c5 Tuple(Array(Nullable(Float64)), Array(Tuple(Nullable(Float64), Nullable(Strin 42.42 \N [1,NULL,3] (1,NULL) ([1,2],[(3,'4'),(5,'6')]) \N Some string [10] (1,2) ([],[]) Regexp -c1 Nullable(String) +c1 Nullable(Float64) c2 Nullable(String) c3 Nullable(String) 42 Some string 1 [([1, 2, 3], String 1), ([], String 1)] diff --git a/tests/queries/0_stateless/02149_schema_inference_formats_with_schema.reference b/tests/queries/0_stateless/02149_schema_inference_formats_with_schema.reference index d3d2d86d696..b0ec4bef499 100644 --- a/tests/queries/0_stateless/02149_schema_inference_formats_with_schema.reference +++ b/tests/queries/0_stateless/02149_schema_inference_formats_with_schema.reference @@ -1,137 +1,137 @@ Arrow -int8 Int8 -uint8 UInt8 -int16 Int16 -uint16 UInt16 -int32 Int32 -uint32 UInt32 -int64 Int64 -uint64 UInt64 +int8 Nullable(Int8) +uint8 Nullable(UInt8) +int16 Nullable(Int16) +uint16 Nullable(UInt16) +int32 Nullable(Int32) +uint32 Nullable(UInt32) +int64 Nullable(Int64) +uint64 Nullable(UInt64) 0 0 0 0 0 0 0 0 -1 1 -1 1 -1 1 -1 1 -float32 Float32 -float64 Float64 -decimal32 Decimal(9, 5) -decimal64 Decimal(18, 5) +float32 Nullable(Float32) +float64 Nullable(Float64) +decimal32 Nullable(Decimal(9, 5)) +decimal64 Nullable(Decimal(18, 5)) 0 0 0 0 1.2 0.7692307692307692 3.33333 333.33333 -date UInt16 -date32 
Date32 +date Nullable(UInt16) +date32 Nullable(Date32) 0 1970-01-01 1 1970-01-02 -str String -fixed_string String +str Nullable(String) +fixed_string Nullable(String) Str: 0 100 Str: 1 200 -array Array(UInt64) -tuple Tuple(`tuple.0` UInt64, `tuple.1` String) -map Map(String, UInt64) +array Array(Nullable(UInt64)) +tuple Tuple(Nullable(UInt64), Nullable(String)) +map Map(String, Nullable(UInt64)) [0,1] (0,'0') {'0':0} [1,2] (1,'1') {'1':1} -nested1 Array(Tuple(`nested1.0` Array(UInt64), `nested1.1` Map(String, UInt64))) -nested2 Tuple(`nested2.0` Tuple(`nested2.0.0` Array(Array(UInt64)), `nested2.0.1` Map(UInt64, Array(Tuple(`nested2.0.1.0` UInt64, `nested2.0.1.1` String)))), `nested2.1` UInt8) +nested1 Array(Tuple(Array(Nullable(UInt64)), Map(String, Nullable(UInt64)))) +nested2 Tuple(Tuple(Array(Array(Nullable(UInt64))), Map(UInt64, Array(Tuple(Nullable(UInt64), Nullable(String))))), Nullable(UInt8)) [([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) [([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) ArrowStream -int8 Int8 -uint8 UInt8 -int16 Int16 -uint16 UInt16 -int32 Int32 -uint32 UInt32 -int64 Int64 -uint64 UInt64 +int8 Nullable(Int8) +uint8 Nullable(UInt8) +int16 Nullable(Int16) +uint16 Nullable(UInt16) +int32 Nullable(Int32) +uint32 Nullable(UInt32) +int64 Nullable(Int64) +uint64 Nullable(UInt64) 0 0 0 0 0 0 0 0 -1 1 -1 1 -1 1 -1 1 -float32 Float32 -float64 Float64 -decimal32 Decimal(9, 5) -decimal64 Decimal(18, 5) +float32 Nullable(Float32) +float64 Nullable(Float64) +decimal32 Nullable(Decimal(9, 5)) +decimal64 Nullable(Decimal(18, 5)) 0 0 0 0 1.2 0.7692307692307692 3.33333 333.33333 -date UInt16 -date32 Date32 +date Nullable(UInt16) +date32 Nullable(Date32) 0 1970-01-01 1 1970-01-02 -str String -fixed_string String +str Nullable(String) +fixed_string Nullable(String) Str: 0 100 Str: 1 200 -array Array(UInt64) -tuple Tuple(`tuple.0` UInt64, `tuple.1` String) -map Map(String, UInt64) +array Array(Nullable(UInt64)) +tuple Tuple(Nullable(UInt64), Nullable(String)) +map Map(String, Nullable(UInt64)) [0,1] (0,'0') {'0':0} [1,2] (1,'1') {'1':1} -nested1 Array(Tuple(`nested1.0` Array(UInt64), `nested1.1` Map(String, UInt64))) -nested2 Tuple(`nested2.0` Tuple(`nested2.0.0` Array(Array(UInt64)), `nested2.0.1` Map(UInt64, Array(Tuple(`nested2.0.1.0` UInt64, `nested2.0.1.1` String)))), `nested2.1` UInt8) +nested1 Array(Tuple(Array(Nullable(UInt64)), Map(String, Nullable(UInt64)))) +nested2 Tuple(Tuple(Array(Array(Nullable(UInt64))), Map(UInt64, Array(Tuple(Nullable(UInt64), Nullable(String))))), Nullable(UInt8)) [([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) [([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) Parquet -int8 Int8 -uint8 UInt8 -int16 Int16 -uint16 UInt16 -int32 Int32 -uint32 Int64 -int64 Int64 -uint64 UInt64 +int8 Nullable(Int8) +uint8 Nullable(UInt8) +int16 Nullable(Int16) +uint16 Nullable(UInt16) +int32 Nullable(Int32) +uint32 Nullable(Int64) +int64 Nullable(Int64) +uint64 Nullable(UInt64) 0 0 0 0 0 0 0 0 -1 1 -1 1 -1 1 -1 1 -float32 Float32 -float64 Float64 -decimal32 Decimal(9, 5) -decimal64 Decimal(18, 5) +float32 Nullable(Float32) +float64 Nullable(Float64) +decimal32 Nullable(Decimal(9, 5)) +decimal64 Nullable(Decimal(18, 5)) 0 0 0 0 1.2 0.7692307692307692 3.33333 333.33333 -date UInt16 -date32 Date32 +date Nullable(UInt16) +date32 Nullable(Date32) 0 1970-01-01 1 1970-01-02 -str String -fixed_string String +str 
Nullable(String) +fixed_string Nullable(String) Str: 0 100 Str: 1 200 -array Array(UInt64) -tuple Tuple(`tuple.0` UInt64, `tuple.1` String) -map Map(String, UInt64) +array Array(Nullable(UInt64)) +tuple Tuple(Nullable(UInt64), Nullable(String)) +map Map(String, Nullable(UInt64)) [0,1] (0,'0') {'0':0} [1,2] (1,'1') {'1':1} -nested1 Array(Tuple(`nested1.0` Array(UInt64), `nested1.1` Map(String, UInt64))) -nested2 Tuple(`nested2.0` Tuple(`nested2.0.0` Array(Array(UInt64)), `nested2.0.1` Map(UInt64, Array(Tuple(`nested2.0.1.0` UInt64, `nested2.0.1.1` String)))), `nested2.1` UInt8) +nested1 Array(Tuple(Array(Nullable(UInt64)), Map(String, Nullable(UInt64)))) +nested2 Tuple(Tuple(Array(Array(Nullable(UInt64))), Map(UInt64, Array(Tuple(Nullable(UInt64), Nullable(String))))), Nullable(UInt8)) [([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) [([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) ORC -int8 Int8 -uint8 Int8 -int16 Int16 -uint16 Int16 -int32 Int32 -uint32 Int32 -int64 Int64 -uint64 Int64 +int8 Nullable(Int8) +uint8 Nullable(Int8) +int16 Nullable(Int16) +uint16 Nullable(Int16) +int32 Nullable(Int32) +uint32 Nullable(Int32) +int64 Nullable(Int64) +uint64 Nullable(Int64) 0 0 0 0 0 0 0 0 -1 1 -1 1 -1 1 -1 1 -float32 Float32 -float64 Float64 -decimal32 Decimal(9, 5) -decimal64 Decimal(18, 5) +float32 Nullable(Float32) +float64 Nullable(Float64) +decimal32 Nullable(Decimal(9, 5)) +decimal64 Nullable(Decimal(18, 5)) 0 0 0 0 1.2 0.7692307692307692 3.33333 333.33333 -date Date32 -date32 Date32 +date Nullable(Date32) +date32 Nullable(Date32) 1970-01-01 1970-01-01 1970-01-02 1970-01-02 -str String -fixed_string String +str Nullable(String) +fixed_string Nullable(String) Str: 0 100 Str: 1 200 -array Array(Int64) -tuple Tuple(`tuple.0` Int64, `tuple.1` String) -map Map(String, Int64) +array Array(Nullable(Int64)) +tuple Tuple(Nullable(Int64), Nullable(String)) +map Map(String, Nullable(Int64)) [0,1] (0,'0') {'0':0} [1,2] (1,'1') {'1':1} -nested1 Array(Tuple(`nested1.0` Array(Int64), `nested1.1` Map(String, Int64))) -nested2 Tuple(`nested2.0` Tuple(`nested2.0.0` Array(Array(Int64)), `nested2.0.1` Map(Int64, Array(Tuple(`nested2.0.1.0` Int64, `nested2.0.1.1` String)))), `nested2.1` Int8) +nested1 Array(Tuple(Array(Nullable(Int64)), Map(String, Nullable(Int64)))) +nested2 Tuple(Tuple(Array(Array(Nullable(Int64))), Map(Int64, Array(Tuple(Nullable(Int64), Nullable(String))))), Nullable(Int8)) [([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) [([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) Native diff --git a/tests/queries/0_stateless/02242_arrow_orc_parquet_nullable_schema_inference.reference b/tests/queries/0_stateless/02242_arrow_orc_parquet_nullable_schema_inference.reference new file mode 100644 index 00000000000..debc5c58936 --- /dev/null +++ b/tests/queries/0_stateless/02242_arrow_orc_parquet_nullable_schema_inference.reference @@ -0,0 +1,40 @@ +Arrow +x Nullable(UInt64) +arr1 Array(Nullable(UInt64)) +arr2 Array(Array(Nullable(String))) +arr3 Array(Tuple(Nullable(String), Nullable(UInt64))) +0 [0,1] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,0)] +\N [NULL,2] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,1)] +2 [2,3] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,2)] +\N [NULL,4] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,3)] +4 [4,5] [[NULL,'String'],[NULL],[]] 
[(NULL,NULL),('String',NULL),(NULL,4)] +ArrowStream +x Nullable(UInt64) +arr1 Array(Nullable(UInt64)) +arr2 Array(Array(Nullable(String))) +arr3 Array(Tuple(Nullable(String), Nullable(UInt64))) +0 [0,1] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,0)] +\N [NULL,2] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,1)] +2 [2,3] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,2)] +\N [NULL,4] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,3)] +4 [4,5] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,4)] +Parquet +x Nullable(UInt64) +arr1 Array(Nullable(UInt64)) +arr2 Array(Array(Nullable(String))) +arr3 Array(Tuple(Nullable(String), Nullable(UInt64))) +0 [0,1] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,0)] +\N [NULL,2] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,1)] +2 [2,3] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,2)] +\N [NULL,4] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,3)] +4 [4,5] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,4)] +ORC +x Nullable(Int64) +arr1 Array(Nullable(Int64)) +arr2 Array(Array(Nullable(String))) +arr3 Array(Tuple(Nullable(String), Nullable(Int64))) +0 [0,1] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,0)] +\N [NULL,2] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,1)] +2 [2,3] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,2)] +\N [NULL,4] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,3)] +4 [4,5] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,4)] diff --git a/tests/queries/0_stateless/02242_arrow_orc_parquet_nullable_schema_inference.sh b/tests/queries/0_stateless/02242_arrow_orc_parquet_nullable_schema_inference.sh new file mode 100755 index 00000000000..1b6999e3f09 --- /dev/null +++ b/tests/queries/0_stateless/02242_arrow_orc_parquet_nullable_schema_inference.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash +# Tags: no-parallel, no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + + +USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') +FILE_NAME=test_02242.data +DATA_FILE=$USER_FILES_PATH/$FILE_NAME + +for format in Arrow ArrowStream Parquet ORC +do + echo $format + $CLICKHOUSE_CLIENT -q "select number % 2 ? NULL : number as x, [number % 2 ? 
NULL : number, number + 1] as arr1, [[NULL, 'String'], [NULL], []] as arr2, [(NULL, NULL), ('String', NULL), (NULL, number)] as arr3 from numbers(5) format $format" > $DATA_FILE + $CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', '$format')" + $CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', '$format')" +done + +rm $DATA_FILE diff --git a/tests/queries/0_stateless/02243_arrow_read_null_type_to_nullable_column.reference b/tests/queries/0_stateless/02243_arrow_read_null_type_to_nullable_column.reference new file mode 100644 index 00000000000..f599e28b8ab --- /dev/null +++ b/tests/queries/0_stateless/02243_arrow_read_null_type_to_nullable_column.reference @@ -0,0 +1 @@ +10 diff --git a/tests/queries/0_stateless/02243_arrow_read_null_type_to_nullable_column.sh b/tests/queries/0_stateless/02243_arrow_read_null_type_to_nullable_column.sh new file mode 100755 index 00000000000..cc8db7fb316 --- /dev/null +++ b/tests/queries/0_stateless/02243_arrow_read_null_type_to_nullable_column.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash +# Tags: no-fasttest + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT -q "drop table if exists test_02243" +$CLICKHOUSE_CLIENT -q "create table test_02243 (image_path Nullable(String), + caption Nullable(String), + NSFW Nullable(String), + similarity Nullable(Float64), + LICENSE Nullable(String), + url Nullable(String), + key Nullable(UInt64), + shard_id Nullable(UInt64), + status Nullable(String), + error_message Nullable(String), + width Nullable(UInt32), + height Nullable(UInt32), + exif Nullable(String), + original_width Nullable(UInt32), + original_height Nullable(UInt32)) engine=Memory" + +cat $CUR_DIR/data_parquet_bad_column/metadata_0.parquet | $CLICKHOUSE_CLIENT --stacktrace -q "insert into test_02243 format Parquet" + +$CLICKHOUSE_CLIENT -q "select count() from test_02243" +$CLICKHOUSE_CLIENT -q "drop table test_02243" diff --git a/tests/queries/0_stateless/02244_column_names_in_shcmea_inference.reference b/tests/queries/0_stateless/02244_column_names_in_shcmea_inference.reference new file mode 100644 index 00000000000..d237caf630f --- /dev/null +++ b/tests/queries/0_stateless/02244_column_names_in_shcmea_inference.reference @@ -0,0 +1,8 @@ +x Nullable(String) +y Nullable(Float64) +x Nullable(String) +y Nullable(Float64) +x Nullable(String) +y Nullable(Float64) +x Nullable(String) +y Nullable(Float64) diff --git a/tests/queries/0_stateless/02244_column_names_in_shcmea_inference.sql b/tests/queries/0_stateless/02244_column_names_in_shcmea_inference.sql new file mode 100644 index 00000000000..cf9f312ab0c --- /dev/null +++ b/tests/queries/0_stateless/02244_column_names_in_shcmea_inference.sql @@ -0,0 +1,12 @@ +insert into function file('test_02244', 'TSV', 'x String, y UInt32') select 'Hello, world!', 42 settings engine_file_truncate_on_insert=1; +desc file('test_02244', 'TSV') settings column_names_for_schema_inference='x,y'; + +insert into function file('test_02244', 'CSV', 'x String, y UInt32') select 'Hello, world!', 42 settings engine_file_truncate_on_insert=1; +desc file('test_02244', 'CSV') settings column_names_for_schema_inference='x,y'; + +insert into function file('test_02244', 'JSONCompactEachRow', 'x String, y UInt32') select 'Hello, world!', 42 settings engine_file_truncate_on_insert=1; +desc file('test_02244', 'JSONCompactEachRow') settings column_names_for_schema_inference='x,y'; + +insert into function file('test_02244', 'Values', 'x String, y UInt32') 
select 'Hello, world!', 42 settings engine_file_truncate_on_insert=1; +desc file('test_02244', 'Values') settings column_names_for_schema_inference='x,y'; + diff --git a/tests/queries/0_stateless/02245_parquet_skip_unknown_type.reference b/tests/queries/0_stateless/02245_parquet_skip_unknown_type.reference new file mode 100644 index 00000000000..4f9cde534f0 --- /dev/null +++ b/tests/queries/0_stateless/02245_parquet_skip_unknown_type.reference @@ -0,0 +1,16 @@ +OK +image_path Nullable(String) +caption Nullable(String) +NSFW Nullable(String) +similarity Nullable(Float64) +LICENSE Nullable(String) +url Nullable(String) +key Nullable(Int64) +shard_id Nullable(Int64) +status Nullable(String) +width Nullable(Int64) +height Nullable(Int64) +exif Nullable(String) +original_width Nullable(Int64) +original_height Nullable(Int64) +10 diff --git a/tests/queries/0_stateless/02245_parquet_skip_unknown_type.sh b/tests/queries/0_stateless/02245_parquet_skip_unknown_type.sh new file mode 100755 index 00000000000..005c089e434 --- /dev/null +++ b/tests/queries/0_stateless/02245_parquet_skip_unknown_type.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash +# Tags: no-fasttest + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') +FILE_NAME=test_02245.parquet +DATA_FILE=$USER_FILES_PATH/$FILE_NAME + +cp $CUR_DIR/data_parquet_bad_column/metadata_0.parquet $DATA_FILE + + +$CLICKHOUSE_CLIENT -q "desc file(test_02245.parquet)" 2>&1 | grep -qF "CANNOT_EXTRACT_TABLE_STRUCTURE" && echo "OK" || echo "FAIL" +$CLICKHOUSE_CLIENT -q "desc file(test_02245.parquet) settings input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference=1" +$CLICKHOUSE_CLIENT -q "select count(*) from file(test_02245.parquet) settings input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference=1" + diff --git a/tests/queries/0_stateless/02246_tsv_csv_best_effort_schema_inference.reference b/tests/queries/0_stateless/02246_tsv_csv_best_effort_schema_inference.reference new file mode 100644 index 00000000000..c245f13fdbe --- /dev/null +++ b/tests/queries/0_stateless/02246_tsv_csv_best_effort_schema_inference.reference @@ -0,0 +1,107 @@ +TSV +c1 Nullable(Float64) +c2 Nullable(String) +c3 Array(Nullable(Float64)) +c4 Tuple(Nullable(Float64), Nullable(Float64), Nullable(Float64)) +42 Some string [1,2,3,4] (1,2,3) +42 abcd [] (4,5,6) +c1 Nullable(String) +[({\'key\' : 42.42}, [\'String\', \'String2\'], 42.42), ({}, [], -42), ({\'key2\' : NULL}, [NULL], NULL)] +[] +[({}, [], 0)] +[({}, [NULL], NULL)] +[({}, [\'String3\'], NULL)] +[({\'key3\': NULL}, []), NULL] +c1 Array(Tuple(Map(String, Nullable(Float64)), Array(Nullable(String)), Nullable(Float64))) +[({'key':42.42},['String','String2'],42.42),({},[],-42),({'key2':NULL},[NULL],NULL)] +[] +[({},[],0)] +[({},[NULL],NULL)] +[({},['String3'],NULL)] +[({'key3':NULL},[],NULL)] +c1 Nullable(Bool) +true +false +\N +c1 Array(Nullable(Bool)) +[true,NULL] +[] +[NULL] +[false] +c1 Nullable(String) +[] +c1 Nullable(String) +{} +c1 Nullable(String) +() +c1 Nullable(String) +[1, 2, 3 +c1 Nullable(String) +[(1, 2, 3 4)] +c1 Nullable(String) +[1, 2, 3 + 4] +c1 Nullable(String) +(1, 2, +c1 Nullable(String) +[1, Some trash, 42.2] +c1 Nullable(String) +[1, \'String\', {\'key\' : 2}] +c1 Nullable(String) +{\'key\' : 1, [1] : 10} +c1 
Nullable(String) +{}{} +c1 Nullable(String) +[1, 2, 3 +c1 Nullable(String) +[abc, def] +c1 Array(Nullable(String)) +['abc','def'] +c1 Nullable(String) +[\'string] +c1 Nullable(String) +\'string +c1 Nullable(Float64) +42.42 +c1 Nullable(String) +42.42sometrash +c1 Nullable(String) +[42.42sometrash, 42.42] + +CSV +c1 Nullable(String) +c2 Nullable(String) +c3 Array(Nullable(Float64)) +c4 Array(Tuple(Nullable(Float64), Nullable(Float64), Nullable(Float64))) +42 Some string [1,2,3,4] [(1,2,3)] +42\\ abcd [] [(4,5,6)] +c1 Nullable(String) +[({\'key\' : 42.42}, [\'String\', \'String2\'], 42.42), ({}, [], -42), ({\'key2\' : NULL}, [NULL], NULL)] +[] +[({}, [], 0)] +[({}, [NULL], NULL)] +[({}, [\'String3\'], NULL)] +[({\'key3\': NULL}, []), NULL] +c1 Array(Tuple(Map(String, Nullable(Float64)), Array(Nullable(String)), Nullable(Float64))) +[({'key':42.42},['String','String2'],42.42),({},[],-42),({'key2':NULL},[NULL],NULL)] +[] +[({},[],0)] +[({},[NULL],NULL)] +[({},['String3'],NULL)] +[({'key3':NULL},[],NULL)] +c1 Nullable(Bool) +true +false +\N +c1 Array(Nullable(Bool)) +[true,NULL] +[] +[NULL] +[false] +c1 Nullable(String) +(1, 2, 3) +c1 Nullable(String) +123.123 +c1 Array(Tuple(Nullable(Float64), Nullable(Float64), Nullable(Float64))) +[(1,2,3)] +c1 Array(Tuple(Nullable(Float64), Nullable(Float64), Nullable(Float64))) +[(1,2,3)] diff --git a/tests/queries/0_stateless/02246_tsv_csv_best_effort_schema_inference.sh b/tests/queries/0_stateless/02246_tsv_csv_best_effort_schema_inference.sh new file mode 100755 index 00000000000..6589765f739 --- /dev/null +++ b/tests/queries/0_stateless/02246_tsv_csv_best_effort_schema_inference.sh @@ -0,0 +1,220 @@ +#!/usr/bin/env bash +# Tags: no-parallel, no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + + +USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') +FILE_NAME=test_02149.data +DATA_FILE=${USER_FILES_PATH:?}/$FILE_NAME + +touch $DATA_FILE + +echo "TSV" + +echo -e "42\tSome string\t[1, 2, 3, 4]\t(1, 2, 3) +42\tabcd\t[]\t(4, 5, 6)" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')" + +echo -e "[({'key' : 42.42}, ['String', 'String2'], 42.42), ({}, [], -42), ({'key2' : NULL}, [NULL], NULL)] +[] +[({}, [], 0)] +[({}, [NULL], NULL)] +[({}, ['String3'], NULL)] +[({'key3': NULL}, []), NULL]"> $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV') settings input_format_tsv_use_best_effort_in_schema_inference=false" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV') settings input_format_tsv_use_best_effort_in_schema_inference=false" + + +echo -e "[({'key' : 42.42}, ['String', 'String2'], 42.42), ({}, [], -42), ({'key2' : NULL}, [NULL], NULL)] +[] +[({}, [], 0)] +[({}, [NULL], NULL)] +[({}, ['String3'], NULL)] +[({'key3': NULL}, [], NULL)]"> $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')" + +echo -e "true +false +\N" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')" + +echo -e "[true, NULL] +[] +[NULL] +[false]" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')" + +echo -e "[]" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')" + +echo -e "{}" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')" + +echo -e "()" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')" + +echo -e "[1, 2, 3" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')" + +echo -e "[(1, 2, 3 4)]" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')" + +echo -e "[1, 2, 3 + 4]" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')" + +echo -e "(1, 2," > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')" + +echo -e "[1, Some trash, 42.2]" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')" + +echo -e "[1, 'String', {'key' : 2}]" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')" + +echo -e "{'key' : 1, [1] : 10}" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')" + +echo -e "{}{}" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')" + +echo -e "[1, 2, 3" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc 
file('$FILE_NAME', 'TSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')" + +echo -e "[abc, def]" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')" + +echo -e "['abc', 'def']" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')" + +echo -e "['string]" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')" + +echo -e "'string" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')" + +echo -e "42.42" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')" + +echo -e "42.42sometrash" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')" + +echo -e "[42.42sometrash, 42.42]" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')" + + +echo +echo "CSV" + +echo -e "42,Some string,'[1, 2, 3, 4]','[(1, 2, 3)]' +42\,abcd,'[]','[(4, 5, 6)]'" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CSV')" + +echo -e "\"[({'key' : 42.42}, ['String', 'String2'], 42.42), ({}, [], -42), ({'key2' : NULL}, [NULL], NULL)]\" +'[]' +'[({}, [], 0)]' +'[({}, [NULL], NULL)]' +\"[({}, ['String3'], NULL)]\" +\"[({'key3': NULL}, []), NULL]\""> $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CSV') settings input_format_csv_use_best_effort_in_schema_inference=false" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CSV') settings input_format_csv_use_best_effort_in_schema_inference=false" + +echo -e "\"[({'key' : 42.42}, ['String', 'String2'], 42.42), ({}, [], -42), ({'key2' : NULL}, [NULL], NULL)]\" +'[]' +'[({}, [], 0)]' +'[({}, [NULL], NULL)]' +\"[({}, ['String3'], NULL)]\" +\"[({'key3': NULL}, [], NULL)]\""> $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CSV')" + +echo -e "true +false +\N" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CSV')" + +echo -e "'[true, NULL]' +'[]' +'[NULL]' +'[false]'" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CSV')" + + +echo -e "'(1, 2, 3)'"> $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CSV')" + +echo -e "'123.123'"> $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CSV')" + +echo -e "'[(1, 2, 3)]'"> $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CSV')" + +echo -e "\"[(1, 2, 3)]\""> $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CSV')" + + From abc020a502962239cd3cf5f0585684a16d0f7fc5 Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 24 Mar 2022 13:08:58 +0000 Subject: [PATCH 02/17] Clean up --- src/Formats/EscapingRuleUtils.cpp | 1 + 
src/Formats/EscapingRuleUtils.h | 17 +++++++++++------ src/Formats/FormatSettings.h | 2 +- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/src/Formats/EscapingRuleUtils.cpp b/src/Formats/EscapingRuleUtils.cpp index 08b34ebb0fc..b3e36e9c14a 100644 --- a/src/Formats/EscapingRuleUtils.cpp +++ b/src/Formats/EscapingRuleUtils.cpp @@ -429,6 +429,7 @@ static DataTypePtr determineDataTypeForSingleFieldImpl(ReadBuffer & buf) if (checkStringCaseInsensitive("NULL", buf)) return std::make_shared(); + /// Number Float64 tmp; if (tryReadFloatText(tmp, buf)) return std::make_shared(); diff --git a/src/Formats/EscapingRuleUtils.h b/src/Formats/EscapingRuleUtils.h index 3c7c768c003..b4f609cd9a6 100644 --- a/src/Formats/EscapingRuleUtils.h +++ b/src/Formats/EscapingRuleUtils.h @@ -43,12 +43,17 @@ String readFieldByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule es /// - For JSON escaping rule we can use JSON parser to parse a single field /// and then convert JSON type of this field to ClickHouse type. /// - For CSV escaping rule we can do the next: -/// - If the field is an unquoted string, then we could try to evaluate it -/// as a constant expression, and if it fails, treat it as a String. -/// - If the field is a string in quotes, then we can try to evaluate -/// expression inside quotes as a constant expression, and if it fails or -/// the result is a number (we don't parse numbers in quotes) we treat it as a String. -/// - For TSV and TSVRaw we treat each field as a String (TODO: try to use some tweaks and heuristics here) +/// - If the field is an unquoted string, then we try to parse it as s number, +/// and if we cannot, treat it as a String. +/// - If the field is a string in quotes, then we try to use some +/// tweaks and heuristics to determine the type inside quotes, and if we can't or +/// the result is a number or tuple (we don't parse numbers in quotes and don't +/// support tuples in CSV) we treat it as a String. +/// - If input_format_csv_use_best_effort_in_schema_inference is disabled, we +/// treat everything as a string. +/// - For TSV and TSVRaw we try to use some tweaks and heuristics to determine the type +/// of value if setting input_format_tsv_use_best_effort_in_schema_inference is enabled, +/// otherwise we treat everything as a string. 
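+/// A few illustrative examples for TSV fields with best-effort inference enabled
+/// (these mirror cases from 02246_tsv_csv_best_effort_schema_inference and are not
+/// exhaustive; types are shown as DESC reports them):
+///   42.42            -> Nullable(Float64)
+///   [1, 2, 3, 4]     -> Array(Nullable(Float64))
+///   (1, 2, 3)        -> Tuple(Nullable(Float64), Nullable(Float64), Nullable(Float64))
+///   [abc, def]       -> Nullable(String)  (array of unquoted strings falls back to String)
+///   42.42sometrash   -> Nullable(String)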
DataTypePtr determineDataTypeByEscapingRule(const String & field, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule); DataTypes determineDataTypesByEscapingRule(const std::vector & fields, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule); diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index 6ecd04536a6..4b39d255110 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -112,7 +112,7 @@ struct FormatSettings char fields_delimiter = '\x01'; char collection_items_delimiter = '\x02'; char map_keys_delimiter = '\x03'; - Names input_field_names = {""}; + Names input_field_names; } hive_text; struct Custom From 3b801a4093322dd048f286d410ef7f1d952a972c Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Thu, 24 Mar 2022 19:16:41 +0100 Subject: [PATCH 03/17] Update src/Processors/Formats/ISchemaReader.cpp Co-authored-by: Vladimir C --- src/Processors/Formats/ISchemaReader.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/Processors/Formats/ISchemaReader.cpp b/src/Processors/Formats/ISchemaReader.cpp index 17db8865310..a80c56a0449 100644 --- a/src/Processors/Formats/ISchemaReader.cpp +++ b/src/Processors/Formats/ISchemaReader.cpp @@ -20,6 +20,13 @@ IRowSchemaReader::IRowSchemaReader( { /// column_names_for_schema_inference is a string in format 'column1,column2,column3,...' boost::split(column_names, format_settings.column_names_for_schema_inference, boost::is_any_of(",")); + for (size_t i = 0 ; i < column_names.size() ; ++i) + { + std::string col_name_trimmed = column_names[i]; + boost::trim(col_name_trimmed); + if (!col_name.empty()) + column_names[i] = col_name_trimmed; + } } } From 6a9df9d471190ddf60ec59a9385d821f2678bee0 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Thu, 24 Mar 2022 19:16:47 +0100 Subject: [PATCH 04/17] Update src/Processors/Formats/ISchemaReader.cpp Co-authored-by: Vladimir C --- src/Processors/Formats/ISchemaReader.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Processors/Formats/ISchemaReader.cpp b/src/Processors/Formats/ISchemaReader.cpp index a80c56a0449..a74de0447cb 100644 --- a/src/Processors/Formats/ISchemaReader.cpp +++ b/src/Processors/Formats/ISchemaReader.cpp @@ -58,7 +58,7 @@ NamesAndTypesList IRowSchemaReader::readSchema() { if (default_type) data_types[i] = default_type; - else if (!default_types.empty() && i < default_types.size() && default_types[i]) + else if (i < default_types.size() && default_types[i]) data_types[i] = default_types[i]; else throw Exception( From 287e1a6efc6d63dce9a111821079e87bc21321f8 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Thu, 24 Mar 2022 19:16:52 +0100 Subject: [PATCH 05/17] Update src/Processors/Formats/ISchemaReader.cpp Co-authored-by: Vladimir C --- src/Processors/Formats/ISchemaReader.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Processors/Formats/ISchemaReader.cpp b/src/Processors/Formats/ISchemaReader.cpp index a74de0447cb..e421692743d 100644 --- a/src/Processors/Formats/ISchemaReader.cpp +++ b/src/Processors/Formats/ISchemaReader.cpp @@ -93,7 +93,7 @@ NamesAndTypesList IRowSchemaReader::readSchema() { if (default_type) data_types[i] = default_type; - else if (!default_types.empty() && i < default_types.size() && default_types[i]) + else if (i < default_types.size() && default_types[i]) data_types[i] = 
default_types[i]; else throw Exception( From 1823cac89d24d906c4f5894d9dd8814bbcd694dc Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Thu, 24 Mar 2022 19:19:32 +0100 Subject: [PATCH 06/17] Update src/Formats/EscapingRuleUtils.h Co-authored-by: Vladimir C --- src/Formats/EscapingRuleUtils.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Formats/EscapingRuleUtils.h b/src/Formats/EscapingRuleUtils.h index b4f609cd9a6..1ce04a8d1b7 100644 --- a/src/Formats/EscapingRuleUtils.h +++ b/src/Formats/EscapingRuleUtils.h @@ -43,7 +43,7 @@ String readFieldByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule es /// - For JSON escaping rule we can use JSON parser to parse a single field /// and then convert JSON type of this field to ClickHouse type. /// - For CSV escaping rule we can do the next: -/// - If the field is an unquoted string, then we try to parse it as s number, +/// - If the field is an unquoted string, then we try to parse it as a number, /// and if we cannot, treat it as a String. /// - If the field is a string in quotes, then we try to use some /// tweaks and heuristics to determine the type inside quotes, and if we can't or From ae92963b15ec6b48408e994fd21bd2898935c209 Mon Sep 17 00:00:00 2001 From: Vladimir C Date: Fri, 25 Mar 2022 11:30:25 +0100 Subject: [PATCH 07/17] Fix build error in Formats/ISchemaReader.cpp --- src/Processors/Formats/ISchemaReader.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Processors/Formats/ISchemaReader.cpp b/src/Processors/Formats/ISchemaReader.cpp index e421692743d..392b636e073 100644 --- a/src/Processors/Formats/ISchemaReader.cpp +++ b/src/Processors/Formats/ISchemaReader.cpp @@ -24,7 +24,7 @@ IRowSchemaReader::IRowSchemaReader( { std::string col_name_trimmed = column_names[i]; boost::trim(col_name_trimmed); - if (!col_name.empty()) + if (!col_name_trimmed.empty()) column_names[i] = col_name_trimmed; } } From 6fb3c3be043b7006e0ef63b62f22cb7620853145 Mon Sep 17 00:00:00 2001 From: avogar Date: Fri, 25 Mar 2022 12:02:21 +0000 Subject: [PATCH 08/17] Fix comments and build --- src/Processors/Formats/ISchemaReader.cpp | 35 ++++++++++++++----- src/Processors/Formats/ISchemaReader.h | 5 ++- .../Formats/Impl/ArrowColumnToCHColumn.cpp | 2 +- .../Formats/Impl/TemplateRowInputFormat.cpp | 2 +- 4 files changed, 33 insertions(+), 11 deletions(-) diff --git a/src/Processors/Formats/ISchemaReader.cpp b/src/Processors/Formats/ISchemaReader.cpp index 392b636e073..0709b8f4d75 100644 --- a/src/Processors/Formats/ISchemaReader.cpp +++ b/src/Processors/Formats/ISchemaReader.cpp @@ -12,24 +12,35 @@ namespace ErrorCodes } IRowSchemaReader::IRowSchemaReader( - ReadBuffer & in_, const FormatSettings & format_settings, DataTypePtr default_type_, const DataTypes & default_types_) + ReadBuffer & in_, const FormatSettings & format_settings) : ISchemaReader(in_) - , max_rows_to_read(format_settings.max_rows_to_read_for_schema_inference), default_type(default_type_), default_types(default_types_) + , max_rows_to_read(format_settings.max_rows_to_read_for_schema_inference) { if (!format_settings.column_names_for_schema_inference.empty()) { /// column_names_for_schema_inference is a string in format 'column1,column2,column3,...' 
boost::split(column_names, format_settings.column_names_for_schema_inference, boost::is_any_of(",")); - for (size_t i = 0 ; i < column_names.size() ; ++i) + for (auto & column_name : column_names) { - std::string col_name_trimmed = column_names[i]; - boost::trim(col_name_trimmed); + std::string col_name_trimmed = boost::trim_copy(column_name); if (!col_name_trimmed.empty()) - column_names[i] = col_name_trimmed; + column_name = col_name_trimmed; } } } +IRowSchemaReader::IRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings, DataTypePtr default_type_) + : IRowSchemaReader(in_, format_settings) +{ + default_type = default_type_; +} + +IRowSchemaReader::IRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings, const DataTypes & default_types_) + : IRowSchemaReader(in_, format_settings) +{ + default_types = default_types_; +} + NamesAndTypesList IRowSchemaReader::readSchema() { DataTypes data_types = readRowAndGetDataTypes(); @@ -63,7 +74,11 @@ NamesAndTypesList IRowSchemaReader::readSchema() else throw Exception( ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "Automatically defined type {} for column {} in row {} differs from type defined by previous rows: {}", new_data_types[i]->getName(), i + 1, row, data_types[i]->getName()); + "Automatically defined type {} for column {} in row {} differs from type defined by previous rows: {}", + new_data_types[i]->getName(), + i + 1, + row, + data_types[i]->getName()); } } } @@ -146,7 +161,11 @@ NamesAndTypesList IRowWithNamesSchemaReader::readSchema() else throw Exception( ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "Automatically defined type {} for column {} in row {} differs from type defined by previous rows: {}", type->getName(), name, row, new_type->getName()); + "Automatically defined type {} for column {} in row {} differs from type defined by previous rows: {}", + type->getName(), + name, + row, + new_type->getName()); } } } diff --git a/src/Processors/Formats/ISchemaReader.h b/src/Processors/Formats/ISchemaReader.h index cd0b552c021..1716b78f1b4 100644 --- a/src/Processors/Formats/ISchemaReader.h +++ b/src/Processors/Formats/ISchemaReader.h @@ -34,7 +34,10 @@ protected: class IRowSchemaReader : public ISchemaReader { public: - IRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings, DataTypePtr default_type_ = nullptr, const DataTypes & default_types_ = {}); + IRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings); + IRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings, DataTypePtr default_type_); + IRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings, const DataTypes & default_types_); + NamesAndTypesList readSchema() override; protected: diff --git a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp index c17c86d51ea..bb58e851ff8 100644 --- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp +++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp @@ -684,7 +684,7 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr & std::vector ArrowColumnToCHColumn::getMissingColumns(const arrow::Schema & schema) const { std::vector missing_columns; - auto block_from_arrow = arrowSchemaToCHHeader(schema, format_name, &header, case_insensitive_matching); + auto block_from_arrow = arrowSchemaToCHHeader(schema, format_name, false, &header, case_insensitive_matching); auto flatten_block_from_arrow = Nested::flatten(block_from_arrow); for (size_t i = 
0, columns = header.columns(); i < columns; ++i) diff --git a/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp b/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp index a1f70730b39..df4d49b172c 100644 --- a/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp @@ -454,7 +454,7 @@ TemplateSchemaReader::TemplateSchemaReader( const ParsedTemplateFormatString & row_format_, std::string row_between_delimiter, const FormatSettings & format_settings_) - : IRowSchemaReader(buf, format_settings_, nullptr, getDefaultDataTypeForEscapingRules(row_format_.escaping_rules)) + : IRowSchemaReader(buf, format_settings_, getDefaultDataTypeForEscapingRules(row_format_.escaping_rules)) , buf(in_) , format(format_) , row_format(row_format_) From fddeecdd699c049e07b60010e57a005a500eb884 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Mon, 28 Mar 2022 21:59:43 +0200 Subject: [PATCH 09/17] Fix fast test --- .../0_stateless/02244_column_names_in_shcmea_inference.sql | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/queries/0_stateless/02244_column_names_in_shcmea_inference.sql b/tests/queries/0_stateless/02244_column_names_in_shcmea_inference.sql index cf9f312ab0c..4733690b225 100644 --- a/tests/queries/0_stateless/02244_column_names_in_shcmea_inference.sql +++ b/tests/queries/0_stateless/02244_column_names_in_shcmea_inference.sql @@ -1,3 +1,5 @@ +-- Tags: no-fasttest + insert into function file('test_02244', 'TSV', 'x String, y UInt32') select 'Hello, world!', 42 settings engine_file_truncate_on_insert=1; desc file('test_02244', 'TSV') settings column_names_for_schema_inference='x,y'; From 97f5033ea98d7f1e06a21210ad9460a121300653 Mon Sep 17 00:00:00 2001 From: avogar Date: Tue, 29 Mar 2022 13:07:37 +0000 Subject: [PATCH 10/17] Fix tests --- src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp | 6 +++++- .../02166_arrow_dictionary_inference.reference | 2 +- .../0_stateless/02166_arrow_dictionary_inference.sh | 2 +- .../02211_shcema_inference_from_stdin.reference | 6 +++--- .../02240_tskv_schema_inference_bug.reference | 10 +++++----- .../02244_column_names_in_shcmea_inference.sql | 2 +- 6 files changed, 16 insertions(+), 12 deletions(-) diff --git a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp index bb58e851ff8..c792d828e44 100644 --- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp +++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp @@ -553,7 +553,11 @@ Block ArrowColumnToCHColumn::arrowSchemaToCHHeader( auto arrow_column = std::make_shared(array_vector); std::unordered_map> dict_values; bool skipped = false; - ColumnWithTypeAndName sample_column = readColumnFromArrowColumn(arrow_column, field->name(), format_name, false, dict_values, false, false, skip_columns_with_unsupported_types, skipped); + bool allow_null_type = false; + if (hint_header && hint_header->has(field->name()) && hint_header->getByName(field->name()).type->isNullable()) + allow_null_type = true; + ColumnWithTypeAndName sample_column = readColumnFromArrowColumn( + arrow_column, field->name(), format_name, false, dict_values, false, allow_null_type, skip_columns_with_unsupported_types, skipped); if (!skipped) sample_columns.emplace_back(std::move(sample_column)); } diff --git a/tests/queries/0_stateless/02166_arrow_dictionary_inference.reference b/tests/queries/0_stateless/02166_arrow_dictionary_inference.reference index 
46f448cfba7..20f3368e446 100644 --- a/tests/queries/0_stateless/02166_arrow_dictionary_inference.reference +++ b/tests/queries/0_stateless/02166_arrow_dictionary_inference.reference @@ -1 +1 @@ -x LowCardinality(UInt64) +x LowCardinality(Nullable(UInt64)) diff --git a/tests/queries/0_stateless/02166_arrow_dictionary_inference.sh b/tests/queries/0_stateless/02166_arrow_dictionary_inference.sh index e560dc10d2c..7d313b571d9 100755 --- a/tests/queries/0_stateless/02166_arrow_dictionary_inference.sh +++ b/tests/queries/0_stateless/02166_arrow_dictionary_inference.sh @@ -5,7 +5,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . "$CURDIR"/../shell_config.sh -$CLICKHOUSE_CLIENT -q "insert into table function file('arrow.dict', 'Arrow', 'x LowCardinality(UInt64)') select number from numbers(10) settings output_format_arrow_low_cardinality_as_dictionary=1" +$CLICKHOUSE_CLIENT -q "insert into table function file('arrow.dict', 'Arrow', 'x LowCardinality(UInt64)') select number from numbers(10) settings output_format_arrow_low_cardinality_as_dictionary=1, engine_file_truncate_on_insert=1" $CLICKHOUSE_CLIENT -q "desc file('arrow.dict', 'Arrow')" diff --git a/tests/queries/0_stateless/02211_shcema_inference_from_stdin.reference b/tests/queries/0_stateless/02211_shcema_inference_from_stdin.reference index d176e0ee1ed..6920aa16198 100644 --- a/tests/queries/0_stateless/02211_shcema_inference_from_stdin.reference +++ b/tests/queries/0_stateless/02211_shcema_inference_from_stdin.reference @@ -9,7 +9,7 @@ x Nullable(Float64) 7 8 9 -c1 Nullable(String) -c2 Nullable(String) -c3 Nullable(String) +c1 Nullable(Float64) +c2 Nullable(Float64) +c3 Nullable(Float64) 1 2 3 diff --git a/tests/queries/0_stateless/02240_tskv_schema_inference_bug.reference b/tests/queries/0_stateless/02240_tskv_schema_inference_bug.reference index a8abc33648e..7950770cfd4 100644 --- a/tests/queries/0_stateless/02240_tskv_schema_inference_bug.reference +++ b/tests/queries/0_stateless/02240_tskv_schema_inference_bug.reference @@ -1,8 +1,8 @@ b Nullable(String) -c Nullable(String) -a Nullable(String) -s1 \N 1 +c Array(Nullable(Float64)) +a Nullable(Float64) +s1 [] 1 } [2] 2 -\N \N \N -\N \N \N +\N [] \N +\N [] \N \N [3] \N diff --git a/tests/queries/0_stateless/02244_column_names_in_shcmea_inference.sql b/tests/queries/0_stateless/02244_column_names_in_shcmea_inference.sql index 4733690b225..af56856f0be 100644 --- a/tests/queries/0_stateless/02244_column_names_in_shcmea_inference.sql +++ b/tests/queries/0_stateless/02244_column_names_in_shcmea_inference.sql @@ -1,4 +1,4 @@ --- Tags: no-fasttest +-- Tags: no-fasttest, no-parallel insert into function file('test_02244', 'TSV', 'x String, y UInt32') select 'Hello, world!', 42 settings engine_file_truncate_on_insert=1; desc file('test_02244', 'TSV') settings column_names_for_schema_inference='x,y'; From a2fd09e0314f83d9e64ea758c973ea56f39d740c Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Tue, 29 Mar 2022 16:34:07 +0200 Subject: [PATCH 11/17] Fix style --- src/Formats/EscapingRuleUtils.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Formats/EscapingRuleUtils.cpp b/src/Formats/EscapingRuleUtils.cpp index b3e36e9c14a..870202faf72 100644 --- a/src/Formats/EscapingRuleUtils.cpp +++ b/src/Formats/EscapingRuleUtils.cpp @@ -23,7 +23,6 @@ namespace DB namespace ErrorCodes { extern const int BAD_ARGUMENTS; - extern const int LOGICAL_ERROR; } FormatSettings::EscapingRule stringToEscapingRule(const 
String & escaping_rule) From 666ef3586c5d200a277b2a806b289f9781360628 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Tue, 29 Mar 2022 16:42:01 +0200 Subject: [PATCH 12/17] Fix typos check --- utils/check-style/codespell-ignore-words.list | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/check-style/codespell-ignore-words.list b/utils/check-style/codespell-ignore-words.list index d3a7586647c..7aabaff17c5 100644 --- a/utils/check-style/codespell-ignore-words.list +++ b/utils/check-style/codespell-ignore-words.list @@ -10,3 +10,4 @@ ths offsett numer ue +alse From 265fbaaa5aadf4e65e3569e840550690b3025586 Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 30 Mar 2022 13:23:43 +0000 Subject: [PATCH 13/17] Fix tests --- .../02245_s3_schema_desc.reference | 24 +++++++++---------- .../0_stateless/02245_s3_schema_desc.sql | 17 ++++++------- 2 files changed, 21 insertions(+), 20 deletions(-) diff --git a/tests/queries/0_stateless/02245_s3_schema_desc.reference b/tests/queries/0_stateless/02245_s3_schema_desc.reference index a5b0f81a2c7..e039680d933 100644 --- a/tests/queries/0_stateless/02245_s3_schema_desc.reference +++ b/tests/queries/0_stateless/02245_s3_schema_desc.reference @@ -1,21 +1,21 @@ -c1 Nullable(String) -c2 Nullable(String) -c3 Nullable(String) -c1 Nullable(String) -c2 Nullable(String) -c3 Nullable(String) +c1 Nullable(Float64) +c2 Nullable(Float64) +c3 Nullable(Float64) +c1 Nullable(Float64) +c2 Nullable(Float64) +c3 Nullable(Float64) c1 UInt64 c2 UInt64 c3 UInt64 -c1 Nullable(String) -c2 Nullable(String) -c3 Nullable(String) +c1 Nullable(Float64) +c2 Nullable(Float64) +c3 Nullable(Float64) c1 UInt64 c2 UInt64 c3 UInt64 -c1 Nullable(String) -c2 Nullable(String) -c3 Nullable(String) +c1 Nullable(Float64) +c2 Nullable(Float64) +c3 Nullable(Float64) c1 UInt64 c2 UInt64 c3 UInt64 diff --git a/tests/queries/0_stateless/02245_s3_schema_desc.sql b/tests/queries/0_stateless/02245_s3_schema_desc.sql index 4ab870e1379..72800f3ab38 100644 --- a/tests/queries/0_stateless/02245_s3_schema_desc.sql +++ b/tests/queries/0_stateless/02245_s3_schema_desc.sql @@ -1,13 +1,14 @@ -- Tags: no-fasttest -- Tag no-fasttest: Depends on AWS -desc s3Cluster('test_cluster_two_shards_localhost', 'http://localhost:11111/test/{a,b,c}.tsv'); -desc s3Cluster('test_cluster_two_shards_localhost', 'http://localhost:11111/test/{a,b,c}.tsv', 'TSV'); -desc s3Cluster('test_cluster_two_shards_localhost', 'http://localhost:11111/test/{a,b,c}.tsv', 'TSV', 'c1 UInt64, c2 UInt64, c3 UInt64'); -desc s3Cluster('test_cluster_two_shards_localhost', 'http://localhost:11111/test/{a,b,c}.tsv', 'test', 'testtest'); -desc s3Cluster('test_cluster_two_shards_localhost', 'http://localhost:11111/test/{a,b,c}.tsv', 'TSV', 'c1 UInt64, c2 UInt64, c3 UInt64', 'auto'); -desc s3Cluster('test_cluster_two_shards_localhost', 'http://localhost:11111/test/{a,b,c}.tsv', 'test', 'testtest', 'TSV'); -desc s3Cluster('test_cluster_two_shards_localhost', 'http://localhost:11111/test/{a,b,c}.tsv', 'test', 'testtest', 'TSV', 'c1 UInt64, c2 UInt64, c3 UInt64'); -desc s3Cluster('test_cluster_two_shards_localhost', 'http://localhost:11111/test/{a,b,c}.tsv', 'test', 'testtest', 'TSV', 'c1 UInt64, c2 UInt64, c3 UInt64', 'auto'); +desc file('data_minio/{a,b,c}.tsv'); +desc file('data_minio/{a,b,c}.tsv', 'TSV'); +desc file('data_minio/{a,b,c}.tsv', 'TSV', 'c1 UInt64, c2 UInt64, c3 UInt64'); +desc file('data_minio/{a,b,c}.tsv'); +desc file('data_minio/{a,b,c}.tsv', 'TSV', 'c1 UInt64, c2 UInt64, c3 UInt64', 'auto'); +desc 
file('data_minio/{a,b,c}.tsv', 'TSV'); +desc file('data_minio/{a,b,c}.tsv', 'TSV', 'c1 UInt64, c2 UInt64, c3 UInt64'); +desc file('data_minio/{a,b,c}.tsv', 'TSV', 'c1 UInt64, c2 UInt64, c3 UInt64', 'auto'); + SELECT * FROM s3(decodeURLComponent(NULL), [NULL]); --{serverError 170} From 564a77c462eda1b1e432008d758f09b8d82a1a26 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Thu, 31 Mar 2022 12:49:23 +0200 Subject: [PATCH 14/17] Fix build --- src/Formats/EscapingRuleUtils.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/Formats/EscapingRuleUtils.cpp b/src/Formats/EscapingRuleUtils.cpp index 870202faf72..e9d7e464cce 100644 --- a/src/Formats/EscapingRuleUtils.cpp +++ b/src/Formats/EscapingRuleUtils.cpp @@ -466,8 +466,6 @@ DataTypePtr determineDataTypeByEscapingRule(const String & field, const FormatSe if (field.size() > 1 && ((field.front() == '\'' && field.back() == '\'') || (field.front() == '"' && field.back() == '"'))) { - auto s = std::string_view(field.data() + 1, field.size() - 2); - ReadBufferFromString buf(std::string_view(field.data() + 1, field.size() - 2)); /// Try to determine the type of value inside quotes auto type = determineDataTypeForSingleField(buf); From 4db5043ed45fffe512d716bd056a3dc5ae76a1d2 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Thu, 31 Mar 2022 12:50:53 +0200 Subject: [PATCH 15/17] Fix test --- .../queries/0_stateless/02245_s3_schema_desc.sql | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/queries/0_stateless/02245_s3_schema_desc.sql b/tests/queries/0_stateless/02245_s3_schema_desc.sql index 72800f3ab38..2cd362ff233 100644 --- a/tests/queries/0_stateless/02245_s3_schema_desc.sql +++ b/tests/queries/0_stateless/02245_s3_schema_desc.sql @@ -1,14 +1,14 @@ -- Tags: no-fasttest -- Tag no-fasttest: Depends on AWS -desc file('data_minio/{a,b,c}.tsv'); -desc file('data_minio/{a,b,c}.tsv', 'TSV'); -desc file('data_minio/{a,b,c}.tsv', 'TSV', 'c1 UInt64, c2 UInt64, c3 UInt64'); -desc file('data_minio/{a,b,c}.tsv'); -desc file('data_minio/{a,b,c}.tsv', 'TSV', 'c1 UInt64, c2 UInt64, c3 UInt64', 'auto'); -desc file('data_minio/{a,b,c}.tsv', 'TSV'); -desc file('data_minio/{a,b,c}.tsv', 'TSV', 'c1 UInt64, c2 UInt64, c3 UInt64'); -desc file('data_minio/{a,b,c}.tsv', 'TSV', 'c1 UInt64, c2 UInt64, c3 UInt64', 'auto'); +desc s3Cluster('test_cluster_two_shards_localhost', 'http://localhost:11111/test/{a,b,c}.tsv'); +desc s3Cluster('test_cluster_two_shards_localhost', 'http://localhost:11111/test/{a,b,c}.tsv', 'TSV'); +desc s3Cluster('test_cluster_two_shards_localhost', 'http://localhost:11111/test/{a,b,c}.tsv', 'TSV', 'c1 UInt64, c2 UInt64, c3 UInt64'); +desc s3Cluster('test_cluster_two_shards_localhost', 'http://localhost:11111/test/{a,b,c}.tsv', 'test', 'testtest'); +desc s3Cluster('test_cluster_two_shards_localhost', 'http://localhost:11111/test/{a,b,c}.tsv', 'TSV', 'c1 UInt64, c2 UInt64, c3 UInt64', 'auto'); +desc s3Cluster('test_cluster_two_shards_localhost', 'http://localhost:11111/test/{a,b,c}.tsv', 'test', 'testtest', 'TSV'); +desc s3Cluster('test_cluster_two_shards_localhost', 'http://localhost:11111/test/{a,b,c}.tsv', 'test', 'testtest', 'TSV', 'c1 UInt64, c2 UInt64, c3 UInt64'); +desc s3Cluster('test_cluster_two_shards_localhost', 'http://localhost:11111/test/{a,b,c}.tsv', 'test', 'testtest', 'TSV', 'c1 UInt64, c2 UInt64, c3 UInt64', 'auto'); SELECT * FROM s3(decodeURLComponent(NULL), [NULL]); --{serverError 170} From 
a41c73fcf6d90e381691ab6f2bb60809196e6bf8 Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 31 Mar 2022 11:25:52 +0000 Subject: [PATCH 16/17] Fix tests --- .../0_stateless/01801_s3_cluster.reference | 48 +++++++++---------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/tests/queries/0_stateless/01801_s3_cluster.reference b/tests/queries/0_stateless/01801_s3_cluster.reference index 31c97f14fa3..0448ff3933b 100644 --- a/tests/queries/0_stateless/01801_s3_cluster.reference +++ b/tests/queries/0_stateless/01801_s3_cluster.reference @@ -2,30 +2,6 @@ 0 0 0 0 0 0 1 2 3 -10 11 12 -13 14 15 -16 17 18 -20 21 22 -23 24 25 -26 27 28 -4 5 6 -7 8 9 -0 0 0 -0 0 0 -0 0 0 -1 2 3 -10 11 12 -13 14 15 -16 17 18 -20 21 22 -23 24 25 -26 27 28 -4 5 6 -7 8 9 -0 0 0 -0 0 0 -0 0 0 -1 2 3 4 5 6 7 8 9 10 11 12 @@ -38,14 +14,26 @@ 0 0 0 0 0 0 1 2 3 +4 5 6 +7 8 9 10 11 12 13 14 15 16 17 18 20 21 22 23 24 25 26 27 28 +0 0 0 +0 0 0 +0 0 0 +1 2 3 4 5 6 7 8 9 +10 11 12 +13 14 15 +16 17 18 +20 21 22 +23 24 25 +26 27 28 0 0 0 0 0 0 0 0 0 @@ -62,14 +50,26 @@ 0 0 0 0 0 0 1 2 3 +4 5 6 +7 8 9 10 11 12 13 14 15 16 17 18 20 21 22 23 24 25 26 27 28 +0 0 0 +0 0 0 +0 0 0 +1 2 3 4 5 6 7 8 9 +10 11 12 +13 14 15 +16 17 18 +20 21 22 +23 24 25 +26 27 28 0 0 0 0 0 0 0 0 0 From 1c783ed88abbad3b144d942aa3a035faf5b27d6c Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 7 Apr 2022 12:17:48 +0000 Subject: [PATCH 17/17] Resolve conflicts --- .../Impl/JSONCompactEachRowRowInputFormat.cpp | 11 +++-------- .../02149_schema_inference.reference | 8 ++++---- .../02240_tskv_schema_inference_bug.reference | 8 ++++---- ...2247_names_order_in_json_and_tskv.reference | 18 +++++++++--------- 4 files changed, 20 insertions(+), 25 deletions(-) diff --git a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp index 798cb69d759..854aefc7562 100644 --- a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp @@ -181,15 +181,10 @@ bool JSONCompactEachRowFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer & return true; } -JSONCompactEachRowRowSchemaReader::JSONCompactEachRowRowSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, bool yield_strings_, const FormatSettings & format_settings_) +JSONCompactEachRowRowSchemaReader::JSONCompactEachRowRowSchemaReader( + ReadBuffer & in_, bool with_names_, bool with_types_, bool yield_strings_, const FormatSettings & format_settings_) : FormatWithNamesAndTypesSchemaReader( - in_, - format_settings_ - with_names_, - with_types_, - &reader, - nullptr, - format_settings_.json.read_bools_as_numbers) + in_, format_settings_, with_names_, with_types_, &reader, nullptr, format_settings_.json.read_bools_as_numbers) , reader(in_, yield_strings_, format_settings_) { } diff --git a/tests/queries/0_stateless/02149_schema_inference.reference b/tests/queries/0_stateless/02149_schema_inference.reference index fe87552e776..2d7dd5caca7 100644 --- a/tests/queries/0_stateless/02149_schema_inference.reference +++ b/tests/queries/0_stateless/02149_schema_inference.reference @@ -75,11 +75,11 @@ c Array(Nullable(Float64)) TSKV a Nullable(Float64) b Nullable(String) -c Array(Nullable(Float64)) -1 s1 \N +c Array(Nullable(Float64)) +1 s1 [] 2 } [2] -\N \N \N -\N \N \N +\N \N [] +\N \N [] \N \N [3] Values c1 Nullable(Float64) diff --git a/tests/queries/0_stateless/02240_tskv_schema_inference_bug.reference 
b/tests/queries/0_stateless/02240_tskv_schema_inference_bug.reference index 5467b32ffb0..d0ced74f8f6 100644 --- a/tests/queries/0_stateless/02240_tskv_schema_inference_bug.reference +++ b/tests/queries/0_stateless/02240_tskv_schema_inference_bug.reference @@ -1,8 +1,8 @@ a Nullable(Float64) b Nullable(String) -c Array(Nullable(Float64)) -1 s1 \N +c Array(Nullable(Float64)) +1 s1 [] 2 } [2] -\N \N \N -\N \N \N +\N \N [] +\N \N [] \N \N [3] diff --git a/tests/queries/0_stateless/02247_names_order_in_json_and_tskv.reference b/tests/queries/0_stateless/02247_names_order_in_json_and_tskv.reference index 49a285dc11a..300846c17a0 100644 --- a/tests/queries/0_stateless/02247_names_order_in_json_and_tskv.reference +++ b/tests/queries/0_stateless/02247_names_order_in_json_and_tskv.reference @@ -1,15 +1,15 @@ -a Nullable(String) +a Nullable(Float64) b Nullable(String) -c Nullable(String) -1 s1 \N +c Array(Nullable(Float64)) +1 s1 [] 2 } [2] -\N \N \N -\N \N \N +\N \N [] +\N \N [] \N \N [3] -b Nullable(String) -a Nullable(String) -c Nullable(String) -e Nullable(String) +b Nullable(Float64) +a Nullable(Float64) +c Nullable(Float64) +e Nullable(Float64) 1 \N \N \N \N 2 3 \N \N \N \N \N