diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 50296b7bd43..980d0428d27 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -641,6 +641,12 @@ class IColumn; M(UInt64, input_format_msgpack_number_of_columns, 0, "The number of columns in inserted MsgPack data. Used for automatic schema inference from data.", 0) \ M(MsgPackUUIDRepresentation, output_format_msgpack_uuid_representation, FormatSettings::MsgPackUUIDRepresentation::EXT, "The way how to output UUID in MsgPack format.", 0) \ M(UInt64, input_format_max_rows_to_read_for_schema_inference, 100, "The maximum rows of data to read for automatic schema inference", 0) \ + M(Bool, input_format_csv_use_best_effort_in_schema_inference, true, "Use some tweaks and heuristics to infer schema in CSV format", 0) \ + M(Bool, input_format_tsv_use_best_effort_in_schema_inference, true, "Use some tweaks and heuristics to infer schema in TSV format", 0) \ + M(Bool, input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference, false, "Allow to skip columns with unsupported types while schema inference for format Parquet", 0) \ + M(Bool, input_format_orc_skip_columns_with_unsupported_types_in_schema_inference, false, "Allow to skip columns with unsupported types while schema inference for format ORC", 0) \ + M(Bool, input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference, false, "Allow to skip columns with unsupported types while schema inference for format Arrow", 0) \ + M(String, column_names_for_schema_inference, "", "The list of column names to use in schema inference for formats without column names. The format: 'column1,column2,column3,...'", 0) \ M(Bool, input_format_json_read_bools_as_numbers, true, "Allow to parse bools as numbers in JSON input formats", 0) \ \ M(DateTimeInputFormat, date_time_input_format, FormatSettings::DateTimeInputFormat::Basic, "Method to read DateTime from text input formats. Possible values: 'basic', 'best_effort' and 'best_effort_us'.", 0) \ diff --git a/src/DataTypes/DataTypeMap.cpp b/src/DataTypes/DataTypeMap.cpp index 41de17982aa..42ec739c33b 100644 --- a/src/DataTypes/DataTypeMap.cpp +++ b/src/DataTypes/DataTypeMap.cpp @@ -45,22 +45,7 @@ DataTypeMap::DataTypeMap(const DataTypePtr & key_type_, const DataTypePtr & valu void DataTypeMap::assertKeyType() const { - bool type_error = false; - if (key_type->getTypeId() == TypeIndex::LowCardinality) - { - const auto & low_cardinality_data_type = assert_cast(*key_type); - if (!isStringOrFixedString(*(low_cardinality_data_type.getDictionaryType()))) - type_error = true; - } - else if (!key_type->isValueRepresentedByInteger() - && !isStringOrFixedString(*key_type) - && !WhichDataType(key_type).isNothing() - && !WhichDataType(key_type).isUUID()) - { - type_error = true; - } - - if (type_error) + if (!checkKeyType(key_type)) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Type of Map key must be a type, that can be represented by integer or String or FixedString (possibly LowCardinality) or UUID," " but {} given", key_type->getName()); @@ -102,6 +87,25 @@ bool DataTypeMap::equals(const IDataType & rhs) const return nested->equals(*rhs_map.nested); } +bool DataTypeMap::checkKeyType(DataTypePtr key_type) +{ + if (key_type->getTypeId() == TypeIndex::LowCardinality) + { + const auto & low_cardinality_data_type = assert_cast(*key_type); + if (!isStringOrFixedString(*(low_cardinality_data_type.getDictionaryType()))) + return false; + } + else if (!key_type->isValueRepresentedByInteger() + && !isStringOrFixedString(*key_type) + && !WhichDataType(key_type).isNothing() + && !WhichDataType(key_type).isUUID()) + { + return false; + } + + return true; +} + static DataTypePtr create(const ASTPtr & arguments) { if (!arguments || arguments->children.size() != 2) diff --git a/src/DataTypes/DataTypeMap.h b/src/DataTypes/DataTypeMap.h index 65bdd93ca4d..479008031fe 100644 --- a/src/DataTypes/DataTypeMap.h +++ b/src/DataTypes/DataTypeMap.h @@ -48,6 +48,8 @@ public: SerializationPtr doGetDefaultSerialization() const override; + static bool checkKeyType(DataTypePtr key_type); + private: void assertKeyType() const; }; diff --git a/src/Formats/EscapingRuleUtils.cpp b/src/Formats/EscapingRuleUtils.cpp index b0ea10abdb6..e9d7e464cce 100644 --- a/src/Formats/EscapingRuleUtils.cpp +++ b/src/Formats/EscapingRuleUtils.cpp @@ -5,12 +5,17 @@ #include #include #include +#include +#include +#include +#include +#include +#include #include #include -#include +#include #include -#include -#include + namespace DB { @@ -18,7 +23,6 @@ namespace DB namespace ErrorCodes { extern const int BAD_ARGUMENTS; - extern const int LOGICAL_ERROR; } FormatSettings::EscapingRule stringToEscapingRule(const String & escaping_rule) @@ -138,7 +142,8 @@ bool deserializeFieldByEscapingRule( serialization->deserializeTextRaw(column, buf, format_settings); break; default: - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Escaping rule {} is not suitable for deserialization", escapingRuleToString(escaping_rule)); + throw Exception( + ErrorCodes::BAD_ARGUMENTS, "Escaping rule {} is not suitable for deserialization", escapingRuleToString(escaping_rule)); } return read; } @@ -176,7 +181,8 @@ void serializeFieldByEscapingRule( } } -void writeStringByEscapingRule(const String & value, WriteBuffer & out, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings) +void writeStringByEscapingRule( + const String & value, WriteBuffer & out, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings) { switch (escaping_rule) { @@ -249,85 +255,269 @@ String readStringByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule e return readByEscapingRule(buf, escaping_rule, format_settings); } -static bool evaluateConstantExpressionFromString(const StringRef & field, DataTypePtr & type, ContextPtr context) +static DataTypePtr determineDataTypeForSingleFieldImpl(ReadBuffer & buf) { - if (!context) - throw Exception(ErrorCodes::LOGICAL_ERROR, "You must provide context to evaluate constant expression"); + if (buf.eof()) + return nullptr; - ParserExpression parser; - Expected expected; - Tokens tokens(field.data, field.data + field.size); - IParser::Pos token_iterator(tokens, context->getSettingsRef().max_parser_depth); - ASTPtr ast; - - /// FIXME: Our parser cannot parse maps in the form of '{key : value}' that is used in text formats. - bool parsed = parser.parse(token_iterator, ast, expected); - if (!parsed || !token_iterator->isEnd()) - return false; - - try + /// Array + if (checkChar('[', buf)) { - std::pair result = evaluateConstantExpression(ast, context); - type = generalizeDataType(result.second); - return true; + skipWhitespaceIfAny(buf); + + DataTypes nested_types; + bool first = true; + while (!buf.eof() && *buf.position() != ']') + { + if (!first) + { + skipWhitespaceIfAny(buf); + if (!checkChar(',', buf)) + return nullptr; + skipWhitespaceIfAny(buf); + } + else + first = false; + + auto nested_type = determineDataTypeForSingleFieldImpl(buf); + if (!nested_type) + return nullptr; + + nested_types.push_back(nested_type); + } + + if (buf.eof()) + return nullptr; + + ++buf.position(); + + if (nested_types.empty()) + return std::make_shared(std::make_shared()); + + auto least_supertype = tryGetLeastSupertype(nested_types); + if (!least_supertype) + return nullptr; + + return std::make_shared(least_supertype); } - catch (...) + + /// Tuple + if (checkChar('(', buf)) { - return false; + skipWhitespaceIfAny(buf); + + DataTypes nested_types; + bool first = true; + while (!buf.eof() && *buf.position() != ')') + { + if (!first) + { + skipWhitespaceIfAny(buf); + if (!checkChar(',', buf)) + return nullptr; + skipWhitespaceIfAny(buf); + } + else + first = false; + + auto nested_type = determineDataTypeForSingleFieldImpl(buf); + if (!nested_type) + return nullptr; + + nested_types.push_back(nested_type); + } + + if (buf.eof() || nested_types.empty()) + return nullptr; + + ++buf.position(); + + return std::make_shared(nested_types); } + + /// Map + if (checkChar('{', buf)) + { + skipWhitespaceIfAny(buf); + + DataTypes key_types; + DataTypes value_types; + bool first = true; + while (!buf.eof() && *buf.position() != '}') + { + if (!first) + { + skipWhitespaceIfAny(buf); + if (!checkChar(',', buf)) + return nullptr; + skipWhitespaceIfAny(buf); + } + else + first = false; + + auto key_type = determineDataTypeForSingleFieldImpl(buf); + if (!key_type) + return nullptr; + + key_types.push_back(key_type); + + skipWhitespaceIfAny(buf); + if (!checkChar(':', buf)) + return nullptr; + skipWhitespaceIfAny(buf); + + auto value_type = determineDataTypeForSingleFieldImpl(buf); + if (!value_type) + return nullptr; + + value_types.push_back(value_type); + } + + if (buf.eof()) + return nullptr; + + ++buf.position(); + skipWhitespaceIfAny(buf); + + if (key_types.empty()) + return std::make_shared(std::make_shared(), std::make_shared()); + + auto key_least_supertype = tryGetLeastSupertype(key_types); + + auto value_least_supertype = tryGetLeastSupertype(value_types); + if (!key_least_supertype || !value_least_supertype) + return nullptr; + + if (!DataTypeMap::checkKeyType(key_least_supertype)) + return nullptr; + + return std::make_shared(key_least_supertype, value_least_supertype); + } + + /// String + if (*buf.position() == '\'') + { + ++buf.position(); + while (!buf.eof()) + { + char * next_pos = find_first_symbols<'\\', '\''>(buf.position(), buf.buffer().end()); + buf.position() = next_pos; + + if (!buf.hasPendingData()) + continue; + + if (*buf.position() == '\'') + break; + + if (*buf.position() == '\\') + ++buf.position(); + } + + if (buf.eof()) + return nullptr; + + ++buf.position(); + return std::make_shared(); + } + + /// Bool + if (checkStringCaseInsensitive("true", buf) || checkStringCaseInsensitive("false", buf)) + return DataTypeFactory::instance().get("Bool"); + + /// Null + if (checkStringCaseInsensitive("NULL", buf)) + return std::make_shared(); + + /// Number + Float64 tmp; + if (tryReadFloatText(tmp, buf)) + return std::make_shared(); + + return nullptr; } -DataTypePtr determineDataTypeByEscapingRule(const String & field, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule, ContextPtr context) +static DataTypePtr determineDataTypeForSingleField(ReadBuffer & buf) +{ + return makeNullableRecursivelyAndCheckForNothing(determineDataTypeForSingleFieldImpl(buf)); +} + +DataTypePtr determineDataTypeByEscapingRule(const String & field, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule) { switch (escaping_rule) { case FormatSettings::EscapingRule::Quoted: { - DataTypePtr type; - bool parsed = evaluateConstantExpressionFromString(field, type, context); - return parsed ? type : nullptr; + ReadBufferFromString buf(field); + auto type = determineDataTypeForSingleField(buf); + return buf.eof() ? type : nullptr; } case FormatSettings::EscapingRule::JSON: return getDataTypeFromJSONField(field); case FormatSettings::EscapingRule::CSV: { + if (!format_settings.csv.input_format_use_best_effort_in_schema_inference) + return makeNullable(std::make_shared()); + if (field.empty() || field == format_settings.csv.null_representation) return nullptr; if (field == format_settings.bool_false_representation || field == format_settings.bool_true_representation) - return std::make_shared(); + return DataTypeFactory::instance().get("Nullable(Bool)"); - DataTypePtr type; - bool parsed; - if (field[0] == '\'' || field[0] == '"') + if (field.size() > 1 && ((field.front() == '\'' && field.back() == '\'') || (field.front() == '"' && field.back() == '"'))) { - /// Try to evaluate expression inside quotes. - parsed = evaluateConstantExpressionFromString(StringRef(field.data() + 1, field.size() - 2), type, context); - /// If it's a number in quotes we determine it as a string. - if (parsed && type && isNumber(removeNullable(type))) - return makeNullable(std::make_shared()); - } - else - parsed = evaluateConstantExpressionFromString(field, type, context); + ReadBufferFromString buf(std::string_view(field.data() + 1, field.size() - 2)); + /// Try to determine the type of value inside quotes + auto type = determineDataTypeForSingleField(buf); - /// If we couldn't parse an expression, determine it as a string. - return parsed ? type : makeNullable(std::make_shared()); + if (!type) + return nullptr; + + /// If it's a number or tuple in quotes or there is some unread data in buffer, we determine it as a string. + if (isNumber(removeNullable(type)) || isTuple(type) || !buf.eof()) + return makeNullable(std::make_shared()); + + return type; + } + + /// Case when CSV value is not in quotes. Check if it's a number, and if not, determine it's as a string. + ReadBufferFromString buf(field); + Float64 tmp; + if (tryReadFloatText(tmp, buf) && buf.eof()) + return makeNullable(std::make_shared()); + + return makeNullable(std::make_shared()); } case FormatSettings::EscapingRule::Raw: [[fallthrough]]; case FormatSettings::EscapingRule::Escaped: - /// TODO: Try to use some heuristics here to determine the type of data. - return field.empty() ? nullptr : makeNullable(std::make_shared()); + { + if (!format_settings.tsv.input_format_use_best_effort_in_schema_inference) + return makeNullable(std::make_shared()); + + if (field.empty() || field == format_settings.tsv.null_representation) + return nullptr; + + if (field == format_settings.bool_false_representation || field == format_settings.bool_true_representation) + return DataTypeFactory::instance().get("Nullable(Bool)"); + + ReadBufferFromString buf(field); + auto type = determineDataTypeForSingleField(buf); + if (!buf.eof()) + return makeNullable(std::make_shared()); + + return type; + } default: throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot determine the type for value with {} escaping rule", escapingRuleToString(escaping_rule)); } } -DataTypes determineDataTypesByEscapingRule(const std::vector & fields, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule, ContextPtr context) +DataTypes determineDataTypesByEscapingRule(const std::vector & fields, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule) { DataTypes data_types; data_types.reserve(fields.size()); for (const auto & field : fields) - data_types.push_back(determineDataTypeByEscapingRule(field, format_settings, escaping_rule, context)); + data_types.push_back(determineDataTypeByEscapingRule(field, format_settings, escaping_rule)); return data_types; } @@ -344,4 +534,12 @@ DataTypePtr getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule escap } } +DataTypes getDefaultDataTypeForEscapingRules(const std::vector & escaping_rules) +{ + DataTypes data_types; + for (const auto & rule : escaping_rules) + data_types.push_back(getDefaultDataTypeForEscapingRule(rule)); + return data_types; +} + } diff --git a/src/Formats/EscapingRuleUtils.h b/src/Formats/EscapingRuleUtils.h index 10147b29ad6..1ce04a8d1b7 100644 --- a/src/Formats/EscapingRuleUtils.h +++ b/src/Formats/EscapingRuleUtils.h @@ -43,15 +43,21 @@ String readFieldByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule es /// - For JSON escaping rule we can use JSON parser to parse a single field /// and then convert JSON type of this field to ClickHouse type. /// - For CSV escaping rule we can do the next: -/// - If the field is an unquoted string, then we could try to evaluate it -/// as a constant expression, and if it fails, treat it as a String. -/// - If the field is a string in quotes, then we can try to evaluate -/// expression inside quotes as a constant expression, and if it fails or -/// the result is a number (we don't parse numbers in quotes) we treat it as a String. -/// - For TSV and TSVRaw we treat each field as a String (TODO: try to use some tweaks and heuristics here) -DataTypePtr determineDataTypeByEscapingRule(const String & field, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule, ContextPtr context = nullptr); -DataTypes determineDataTypesByEscapingRule(const std::vector & fields, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule, ContextPtr context = nullptr); +/// - If the field is an unquoted string, then we try to parse it as a number, +/// and if we cannot, treat it as a String. +/// - If the field is a string in quotes, then we try to use some +/// tweaks and heuristics to determine the type inside quotes, and if we can't or +/// the result is a number or tuple (we don't parse numbers in quotes and don't +/// support tuples in CSV) we treat it as a String. +/// - If input_format_csv_use_best_effort_in_schema_inference is disabled, we +/// treat everything as a string. +/// - For TSV and TSVRaw we try to use some tweaks and heuristics to determine the type +/// of value if setting input_format_tsv_use_best_effort_in_schema_inference is enabled, +/// otherwise we treat everything as a string. +DataTypePtr determineDataTypeByEscapingRule(const String & field, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule); +DataTypes determineDataTypesByEscapingRule(const std::vector & fields, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule); DataTypePtr getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule escaping_rule); +DataTypes getDefaultDataTypeForEscapingRules(const std::vector & escaping_rules); } diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 9dbce146ffa..8f9f600f594 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -65,6 +65,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.csv.input_format_enum_as_number = settings.input_format_csv_enum_as_number; format_settings.csv.null_representation = settings.format_csv_null_representation; format_settings.csv.input_format_arrays_as_nested_csv = settings.input_format_csv_arrays_as_nested_csv; + format_settings.csv.input_format_use_best_effort_in_schema_inference = settings.input_format_csv_use_best_effort_in_schema_inference; format_settings.hive_text.fields_delimiter = settings.input_format_hive_text_fields_delimiter; format_settings.hive_text.collection_items_delimiter = settings.input_format_hive_text_collection_items_delimiter; format_settings.hive_text.map_keys_delimiter = settings.input_format_hive_text_map_keys_delimiter; @@ -97,6 +98,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.parquet.import_nested = settings.input_format_parquet_import_nested; format_settings.parquet.case_insensitive_column_matching = settings.input_format_parquet_case_insensitive_column_matching; format_settings.parquet.allow_missing_columns = settings.input_format_parquet_allow_missing_columns; + format_settings.parquet.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference; format_settings.pretty.charset = settings.output_format_pretty_grid_charset.toString() == "ASCII" ? FormatSettings::Pretty::Charset::ASCII : FormatSettings::Pretty::Charset::UTF8; format_settings.pretty.color = settings.output_format_pretty_color; format_settings.pretty.max_column_pad_width = settings.output_format_pretty_max_column_pad_width; @@ -117,6 +119,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.tsv.empty_as_default = settings.input_format_tsv_empty_as_default; format_settings.tsv.input_format_enum_as_number = settings.input_format_tsv_enum_as_number; format_settings.tsv.null_representation = settings.format_tsv_null_representation; + format_settings.tsv.input_format_use_best_effort_in_schema_inference = settings.input_format_tsv_use_best_effort_in_schema_inference; format_settings.values.accurate_types_of_literals = settings.input_format_values_accurate_types_of_literals; format_settings.values.deduce_templates_of_expressions = settings.input_format_values_deduce_templates_of_expressions; format_settings.values.interpret_expressions = settings.input_format_values_interpret_expressions; @@ -126,10 +129,17 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.arrow.low_cardinality_as_dictionary = settings.output_format_arrow_low_cardinality_as_dictionary; format_settings.arrow.import_nested = settings.input_format_arrow_import_nested; format_settings.arrow.allow_missing_columns = settings.input_format_arrow_allow_missing_columns; + format_settings.arrow.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference; + format_settings.orc.import_nested = settings.input_format_orc_import_nested; + format_settings.orc.allow_missing_columns = settings.input_format_orc_allow_missing_columns; + format_settings.orc.row_batch_size = settings.input_format_orc_row_batch_size; + format_settings.orc.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_orc_skip_columns_with_unsupported_types_in_schema_inference; + format_settings.arrow.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference; format_settings.arrow.case_insensitive_column_matching = settings.input_format_arrow_case_insensitive_column_matching; format_settings.orc.import_nested = settings.input_format_orc_import_nested; format_settings.orc.allow_missing_columns = settings.input_format_orc_allow_missing_columns; format_settings.orc.row_batch_size = settings.input_format_orc_row_batch_size; + format_settings.orc.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_orc_skip_columns_with_unsupported_types_in_schema_inference; format_settings.orc.case_insensitive_column_matching = settings.input_format_orc_case_insensitive_column_matching; format_settings.defaults_for_omitted_fields = settings.input_format_defaults_for_omitted_fields; format_settings.capn_proto.enum_comparing_mode = settings.format_capn_proto_enum_comparising_mode; @@ -137,6 +147,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.msgpack.number_of_columns = settings.input_format_msgpack_number_of_columns; format_settings.msgpack.output_uuid_representation = settings.output_format_msgpack_uuid_representation; format_settings.max_rows_to_read_for_schema_inference = settings.input_format_max_rows_to_read_for_schema_inference; + format_settings.column_names_for_schema_inference = settings.column_names_for_schema_inference; /// Validate avro_schema_registry_url with RemoteHostFilter when non-empty and in Server context if (format_settings.schema.is_server) @@ -371,7 +382,7 @@ SchemaReaderPtr FormatFactory::getSchemaReader( throw Exception("FormatFactory: Format " + name + " doesn't support schema inference.", ErrorCodes::LOGICAL_ERROR); auto format_settings = _format_settings ? *_format_settings : getFormatSettings(context); - return schema_reader_creator(buf, format_settings, context); + return schema_reader_creator(buf, format_settings); } ExternalSchemaReaderPtr FormatFactory::getExternalSchemaReader( diff --git a/src/Formats/FormatFactory.h b/src/Formats/FormatFactory.h index 344dabd3f4d..2f53da3bdff 100644 --- a/src/Formats/FormatFactory.h +++ b/src/Formats/FormatFactory.h @@ -97,7 +97,7 @@ private: /// The checker should return true if format support append. using AppendSupportChecker = std::function; - using SchemaReaderCreator = std::function; + using SchemaReaderCreator = std::function; using ExternalSchemaReaderCreator = std::function; struct Creators diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index 608afeb8a2c..6da13a6b032 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -36,6 +36,8 @@ struct FormatSettings bool seekable_read = true; UInt64 max_rows_to_read_for_schema_inference = 100; + String column_names_for_schema_inference = ""; + enum class DateTimeInputFormat { Basic, /// Default format for fast parsing: YYYY-MM-DD hh:mm:ss (ISO-8601 without fractional part and timezone) or NNNNNNNNNN unix timestamp. @@ -77,6 +79,7 @@ struct FormatSettings bool low_cardinality_as_dictionary = false; bool import_nested = false; bool allow_missing_columns = false; + bool skip_columns_with_unsupported_types_in_schema_inference = false; bool case_insensitive_column_matching = false; } arrow; @@ -104,6 +107,7 @@ struct FormatSettings bool input_format_arrays_as_nested_csv = false; String null_representation = "\\N"; char tuple_delimiter = ','; + bool input_format_use_best_effort_in_schema_inference = true; } csv; struct HiveText @@ -141,6 +145,7 @@ struct FormatSettings UInt64 row_group_size = 1000000; bool import_nested = false; bool allow_missing_columns = false; + bool skip_columns_with_unsupported_types_in_schema_inference = false; bool case_insensitive_column_matching = false; std::unordered_set skip_row_groups = {}; } parquet; @@ -209,6 +214,7 @@ struct FormatSettings bool crlf_end_of_line = false; String null_representation = "\\N"; bool input_format_enum_as_number = false; + bool input_format_use_best_effort_in_schema_inference = true; } tsv; struct @@ -223,6 +229,7 @@ struct FormatSettings bool import_nested = false; bool allow_missing_columns = false; int64_t row_batch_size = 100'000; + bool skip_columns_with_unsupported_types_in_schema_inference = false; bool case_insensitive_column_matching = false; std::unordered_set skip_stripes = {}; } orc; diff --git a/src/Formats/ReadSchemaUtils.cpp b/src/Formats/ReadSchemaUtils.cpp index 8e2531e2006..3e88b51152d 100644 --- a/src/Formats/ReadSchemaUtils.cpp +++ b/src/Formats/ReadSchemaUtils.cpp @@ -105,8 +105,11 @@ ColumnsDescription readSchemaFromFormat(const String & format_name, const std::o return readSchemaFromFormat(format_name, format_settings, read_buffer_creator, context, buf_out); } -DataTypePtr generalizeDataType(DataTypePtr type) +DataTypePtr makeNullableRecursivelyAndCheckForNothing(DataTypePtr type) { + if (!type) + return nullptr; + WhichDataType which(type); if (which.isNothing()) @@ -115,16 +118,13 @@ DataTypePtr generalizeDataType(DataTypePtr type) if (which.isNullable()) { const auto * nullable_type = assert_cast(type.get()); - return generalizeDataType(nullable_type->getNestedType()); + return makeNullableRecursivelyAndCheckForNothing(nullable_type->getNestedType()); } - if (isNumber(type)) - return makeNullable(std::make_shared()); - if (which.isArray()) { const auto * array_type = assert_cast(type.get()); - auto nested_type = generalizeDataType(array_type->getNestedType()); + auto nested_type = makeNullableRecursivelyAndCheckForNothing(array_type->getNestedType()); return nested_type ? std::make_shared(nested_type) : nullptr; } @@ -134,7 +134,7 @@ DataTypePtr generalizeDataType(DataTypePtr type) DataTypes nested_types; for (const auto & element : tuple_type->getElements()) { - auto nested_type = generalizeDataType(element); + auto nested_type = makeNullableRecursivelyAndCheckForNothing(element); if (!nested_type) return nullptr; nested_types.push_back(nested_type); @@ -145,19 +145,27 @@ DataTypePtr generalizeDataType(DataTypePtr type) if (which.isMap()) { const auto * map_type = assert_cast(type.get()); - auto key_type = removeNullable(generalizeDataType(map_type->getKeyType())); - auto value_type = generalizeDataType(map_type->getValueType()); - return key_type && value_type ? std::make_shared(key_type, value_type) : nullptr; + auto key_type = makeNullableRecursivelyAndCheckForNothing(map_type->getKeyType()); + auto value_type = makeNullableRecursivelyAndCheckForNothing(map_type->getValueType()); + return key_type && value_type ? std::make_shared(removeNullable(key_type), value_type) : nullptr; } if (which.isLowCarnality()) { const auto * lc_type = assert_cast(type.get()); - auto nested_type = generalizeDataType(lc_type->getDictionaryType()); + auto nested_type = makeNullableRecursivelyAndCheckForNothing(lc_type->getDictionaryType()); return nested_type ? std::make_shared(nested_type) : nullptr; } return makeNullable(type); } +NamesAndTypesList getNamesAndRecursivelyNullableTypes(const Block & header) +{ + NamesAndTypesList result; + for (auto & [name, type] : header.getNamesAndTypesList()) + result.emplace_back(name, makeNullableRecursivelyAndCheckForNothing(type)); + return result; +} + } diff --git a/src/Formats/ReadSchemaUtils.h b/src/Formats/ReadSchemaUtils.h index 4446393a581..ea8ebbad4c0 100644 --- a/src/Formats/ReadSchemaUtils.h +++ b/src/Formats/ReadSchemaUtils.h @@ -29,14 +29,16 @@ ColumnsDescription readSchemaFromFormat( ContextPtr context, std::unique_ptr & buf_out); -/// Convert type to the most general type: -/// - IntN, UIntN, FloatN, Decimal -> Float64 +/// Make type Nullable recursively: /// - Type -> Nullable(type) /// - Array(Type) -> Array(Nullable(Type)) /// - Tuple(Type1, ..., TypeN) -> Tuple(Nullable(Type1), ..., Nullable(TypeN)) /// - Map(KeyType, ValueType) -> Map(KeyType, Nullable(ValueType)) /// - LowCardinality(Type) -> LowCardinality(Nullable(Type)) /// If type is Nothing or one of the nested types is Nothing, return nullptr. -DataTypePtr generalizeDataType(DataTypePtr type); +DataTypePtr makeNullableRecursivelyAndCheckForNothing(DataTypePtr type); +/// Call makeNullableRecursivelyAndCheckForNothing for all types +/// in the block and return names and types. +NamesAndTypesList getNamesAndRecursivelyNullableTypes(const Block & header); } diff --git a/src/IO/ReadHelpers.cpp b/src/IO/ReadHelpers.cpp index e086f16be54..98a33612aa8 100644 --- a/src/IO/ReadHelpers.cpp +++ b/src/IO/ReadHelpers.cpp @@ -1366,6 +1366,7 @@ void readQuotedFieldIntoString(String & s, ReadBuffer & buf) /// - Tuples: (...) /// - Maps: {...} /// - NULL + /// - Bool: true/false /// - Number: integer, float, decimal. if (*buf.position() == '\'') @@ -1394,6 +1395,16 @@ void readQuotedFieldIntoString(String & s, ReadBuffer & buf) s.append("NaN"); } } + else if (checkCharCaseInsensitive('t', buf)) + { + assertStringCaseInsensitive("rue", buf); + s.append("true"); + } + else if (checkCharCaseInsensitive('f', buf)) + { + assertStringCaseInsensitive("alse", buf); + s.append("false"); + } else { /// It's an integer, float or decimal. They all can be parsed as float. diff --git a/src/Processors/Formats/ISchemaReader.cpp b/src/Processors/Formats/ISchemaReader.cpp index 7cdb0644a9d..2e045466ae8 100644 --- a/src/Processors/Formats/ISchemaReader.cpp +++ b/src/Processors/Formats/ISchemaReader.cpp @@ -2,6 +2,7 @@ #include #include #include +#include namespace DB { @@ -66,9 +67,32 @@ static void checkTypeAndAppend(NamesAndTypesList & result, DataTypePtr & type, c result.emplace_back(name, type); } -IRowSchemaReader::IRowSchemaReader(ReadBuffer & in_, size_t max_rows_to_read_, DataTypePtr default_type_, bool allow_bools_as_numbers_) - : ISchemaReader(in_), max_rows_to_read(max_rows_to_read_), default_type(default_type_), allow_bools_as_numbers(allow_bools_as_numbers_) +IRowSchemaReader::IRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings, bool allow_bools_as_numbers_) + : ISchemaReader(in_), max_rows_to_read(format_settings.max_rows_to_read_for_schema_inference), allow_bools_as_numbers(allow_bools_as_numbers_) { + if (!format_settings.column_names_for_schema_inference.empty()) + { + /// column_names_for_schema_inference is a string in format 'column1,column2,column3,...' + boost::split(column_names, format_settings.column_names_for_schema_inference, boost::is_any_of(",")); + for (auto & column_name : column_names) + { + std::string col_name_trimmed = boost::trim_copy(column_name); + if (!col_name_trimmed.empty()) + column_name = col_name_trimmed; + } + } +} + +IRowSchemaReader::IRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings, DataTypePtr default_type_, bool allow_bools_as_numbers_) + : IRowSchemaReader(in_, format_settings, allow_bools_as_numbers_) +{ + default_type = default_type_; +} + +IRowSchemaReader::IRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings, const DataTypes & default_types_, bool allow_bools_as_numbers_) + : IRowSchemaReader(in_, format_settings, allow_bools_as_numbers_) +{ + default_types = default_types_; } NamesAndTypesList IRowSchemaReader::readSchema() @@ -90,7 +114,7 @@ NamesAndTypesList IRowSchemaReader::readSchema() if (!new_data_types[i]) continue; - chooseResultType(data_types[i], new_data_types[i], allow_bools_as_numbers, default_type, std::to_string(i + 1), row); + chooseResultType(data_types[i], new_data_types[i], allow_bools_as_numbers, getDefaultType(i), std::to_string(i + 1), row); } } @@ -115,12 +139,21 @@ NamesAndTypesList IRowSchemaReader::readSchema() for (size_t i = 0; i != data_types.size(); ++i) { /// Check that we could determine the type of this column. - checkTypeAndAppend(result, data_types[i], column_names[i], default_type, max_rows_to_read); + checkTypeAndAppend(result, data_types[i], column_names[i], getDefaultType(i), max_rows_to_read); } return result; } +DataTypePtr IRowSchemaReader::getDefaultType(size_t column) const +{ + if (default_type) + return default_type; + if (column < default_types.size() && default_types[column]) + return default_types[column]; + return nullptr; +} + IRowWithNamesSchemaReader::IRowWithNamesSchemaReader(ReadBuffer & in_, size_t max_rows_to_read_, DataTypePtr default_type_, bool allow_bools_as_numbers_) : ISchemaReader(in_), max_rows_to_read(max_rows_to_read_), default_type(default_type_), allow_bools_as_numbers(allow_bools_as_numbers_) { diff --git a/src/Processors/Formats/ISchemaReader.h b/src/Processors/Formats/ISchemaReader.h index 7e809d3d963..a9697374bc0 100644 --- a/src/Processors/Formats/ISchemaReader.h +++ b/src/Processors/Formats/ISchemaReader.h @@ -31,12 +31,17 @@ protected: /// Base class for schema inference for formats that read data row by row. /// It reads data row by row (up to max_rows_to_read), determines types of columns /// for each row and compare them with types from the previous rows. If some column -/// contains values with different types in different rows, the default type will be -/// used for this column or the exception will be thrown (if default type is not set). +/// contains values with different types in different rows, the default type +/// (from argument default_type_) will be used for this column or the exception +/// will be thrown (if default type is not set). If different columns have different +/// default types, you can provide them by default_types_ argument. class IRowSchemaReader : public ISchemaReader { public: - IRowSchemaReader(ReadBuffer & in_, size_t max_rows_to_read_, DataTypePtr default_type_ = nullptr, bool allow_bools_as_numbers_ = false); + IRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings, bool allow_bools_as_numbers_ = false); + IRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings, DataTypePtr default_type_, bool allow_bools_as_numbers_ = false); + IRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings, const DataTypes & default_types_, bool allow_bools_as_numbers_ = false); + NamesAndTypesList readSchema() override; protected: @@ -49,8 +54,11 @@ protected: void setColumnNames(const std::vector & names) { column_names = names; } private: + + DataTypePtr getDefaultType(size_t column) const; size_t max_rows_to_read; DataTypePtr default_type; + DataTypes default_types; bool allow_bools_as_numbers; std::vector column_names; }; diff --git a/src/Processors/Formats/Impl/ArrowBlockInputFormat.cpp b/src/Processors/Formats/Impl/ArrowBlockInputFormat.cpp index 37a107ae367..792ebd09392 100644 --- a/src/Processors/Formats/Impl/ArrowBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ArrowBlockInputFormat.cpp @@ -3,6 +3,7 @@ #if USE_ARROW #include +#include #include #include #include @@ -171,8 +172,9 @@ NamesAndTypesList ArrowSchemaReader::readSchema() schema = createFileReader(in, format_settings, is_stopped)->schema(); } - auto header = ArrowColumnToCHColumn::arrowSchemaToCHHeader(*schema, stream ? "ArrowStream" : "Arrow"); - return header.getNamesAndTypesList(); + auto header = ArrowColumnToCHColumn::arrowSchemaToCHHeader( + *schema, stream ? "ArrowStream" : "Arrow", format_settings.arrow.skip_columns_with_unsupported_types_in_schema_inference); + return getNamesAndRecursivelyNullableTypes(header); } void registerInputFormatArrow(FormatFactory & factory) @@ -202,13 +204,13 @@ void registerArrowSchemaReader(FormatFactory & factory) { factory.registerSchemaReader( "Arrow", - [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + [](ReadBuffer & buf, const FormatSettings & settings) { return std::make_shared(buf, false, settings); }); factory.registerSchemaReader( "ArrowStream", - [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + [](ReadBuffer & buf, const FormatSettings & settings) { return std::make_shared(buf, true, settings); });} diff --git a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp index 0a72e561e4e..c792d828e44 100644 --- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp +++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -26,11 +27,13 @@ #include #include #include +#include #include #include #include #include #include +#include #include /// UINT16 and UINT32 are processed separately, see comments in readColumnFromArrowColumn. @@ -329,12 +332,17 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( const std::string & format_name, bool is_nullable, std::unordered_map> & dictionary_values, - bool read_ints_as_dates) + bool read_ints_as_dates, + bool allow_null_type, + bool skip_columns_with_unsupported_types, + bool & skipped) { if (!is_nullable && arrow_column->null_count() && arrow_column->type()->id() != arrow::Type::LIST && arrow_column->type()->id() != arrow::Type::MAP && arrow_column->type()->id() != arrow::Type::STRUCT) { - auto nested_column = readColumnFromArrowColumn(arrow_column, column_name, format_name, true, dictionary_values, read_ints_as_dates); + auto nested_column = readColumnFromArrowColumn(arrow_column, column_name, format_name, true, dictionary_values, read_ints_as_dates, allow_null_type, skip_columns_with_unsupported_types, skipped); + if (skipped) + return {}; auto nullmap_column = readByteMapFromArrowColumn(arrow_column); auto nullable_type = std::make_shared(std::move(nested_column.type)); auto nullable_column = ColumnNullable::create(nested_column.column, nullmap_column); @@ -379,7 +387,10 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( case arrow::Type::MAP: { auto arrow_nested_column = getNestedArrowColumn(arrow_column); - auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_values, read_ints_as_dates); + auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_values, read_ints_as_dates, allow_null_type, skip_columns_with_unsupported_types, skipped); + if (skipped) + return {}; + auto offsets_column = readOffsetsFromArrowListColumn(arrow_column); const auto * tuple_column = assert_cast(nested_column.column.get()); @@ -391,7 +402,9 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( case arrow::Type::LIST: { auto arrow_nested_column = getNestedArrowColumn(arrow_column); - auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_values, read_ints_as_dates); + auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_values, read_ints_as_dates, allow_null_type, skip_columns_with_unsupported_types, skipped); + if (skipped) + return {}; auto offsets_column = readOffsetsFromArrowListColumn(arrow_column); auto array_column = ColumnArray::create(nested_column.column, offsets_column); auto array_type = std::make_shared(nested_column.type); @@ -416,7 +429,9 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( for (int i = 0; i != arrow_struct_type->num_fields(); ++i) { auto nested_arrow_column = std::make_shared(nested_arrow_columns[i]); - auto element = readColumnFromArrowColumn(nested_arrow_column, arrow_struct_type->field(i)->name(), format_name, false, dictionary_values, read_ints_as_dates); + auto element = readColumnFromArrowColumn(nested_arrow_column, arrow_struct_type->field(i)->name(), format_name, false, dictionary_values, read_ints_as_dates, allow_null_type, skip_columns_with_unsupported_types, skipped); + if (skipped) + return {}; tuple_elements.emplace_back(std::move(element.column)); tuple_types.emplace_back(std::move(element.type)); tuple_names.emplace_back(std::move(element.name)); @@ -439,7 +454,7 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( dict_array.emplace_back(dict_chunk.dictionary()); } auto arrow_dict_column = std::make_shared(dict_array); - auto dict_column = readColumnFromArrowColumn(arrow_dict_column, column_name, format_name, false, dictionary_values, read_ints_as_dates); + auto dict_column = readColumnFromArrowColumn(arrow_dict_column, column_name, format_name, false, dictionary_values, read_ints_as_dates, allow_null_type, skip_columns_with_unsupported_types, skipped); /// We should convert read column to ColumnUnique. auto tmp_lc_column = DataTypeLowCardinality(dict_column.type).createColumn(); @@ -469,9 +484,33 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( # undef DISPATCH // TODO: read JSON as a string? // TODO: read UUID as a string? + case arrow::Type::NA: + { + if (allow_null_type) + { + auto type = std::make_shared(); + auto column = ColumnNothing::create(arrow_column->length()); + return {std::move(column), type, column_name}; + } + [[fallthrough]]; + } default: - throw Exception(ErrorCodes::UNKNOWN_TYPE, - "Unsupported {} type '{}' of an input column '{}'.", format_name, arrow_column->type()->name(), column_name); + { + if (skip_columns_with_unsupported_types) + { + skipped = true; + return {}; + } + + throw Exception( + ErrorCodes::UNKNOWN_TYPE, + "Unsupported {} type '{}' of an input column '{}'. If it happens during schema inference and you want to skip columns with " + "unsupported types, you can enable setting input_format_{}_skip_columns_with_unsupported_types_in_schema_inference", + format_name, + arrow_column->type()->name(), + column_name, + boost::algorithm::to_lower_copy(format_name)); + } } } @@ -485,8 +524,9 @@ static void checkStatus(const arrow::Status & status, const String & column_name throw Exception{ErrorCodes::UNKNOWN_EXCEPTION, "Error with a {} column '{}': {}.", format_name, column_name, status.ToString()}; } + Block ArrowColumnToCHColumn::arrowSchemaToCHHeader( - const arrow::Schema & schema, const std::string & format_name, const Block * hint_header, bool ignore_case) + const arrow::Schema & schema, const std::string & format_name, bool skip_columns_with_unsupported_types, const Block * hint_header, bool ignore_case) { ColumnsWithTypeAndName sample_columns; std::unordered_set nested_table_names; @@ -512,9 +552,14 @@ Block ArrowColumnToCHColumn::arrowSchemaToCHHeader( arrow::ArrayVector array_vector = {arrow_array}; auto arrow_column = std::make_shared(array_vector); std::unordered_map> dict_values; - ColumnWithTypeAndName sample_column = readColumnFromArrowColumn(arrow_column, field->name(), format_name, false, dict_values, false); - - sample_columns.emplace_back(std::move(sample_column)); + bool skipped = false; + bool allow_null_type = false; + if (hint_header && hint_header->has(field->name()) && hint_header->getByName(field->name()).type->isNullable()) + allow_null_type = true; + ColumnWithTypeAndName sample_column = readColumnFromArrowColumn( + arrow_column, field->name(), format_name, false, dict_values, false, allow_null_type, skip_columns_with_unsupported_types, skipped); + if (!skipped) + sample_columns.emplace_back(std::move(sample_column)); } return Block(std::move(sample_columns)); } @@ -559,6 +604,7 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr & UInt64 num_rows = name_to_column_ptr.begin()->second->length(); columns_list.reserve(header.rows()); std::unordered_map nested_tables; + bool skipped = false; for (size_t column_i = 0, columns = header.columns(); column_i < columns; ++column_i) { const ColumnWithTypeAndName & header_column = header.getByPosition(column_i); @@ -582,7 +628,7 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr & { std::shared_ptr arrow_column = name_to_column_ptr[search_nested_table_name]; ColumnsWithTypeAndName cols - = {readColumnFromArrowColumn(arrow_column, nested_table_name, format_name, false, dictionary_values, true)}; + = {readColumnFromArrowColumn(arrow_column, nested_table_name, format_name, false, dictionary_values, true, true, false, skipped)}; Block block(cols); nested_tables[search_nested_table_name] = std::make_shared(Nested::flatten(block)); } @@ -615,7 +661,7 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr & else { auto arrow_column = name_to_column_ptr[search_column_name]; - column = readColumnFromArrowColumn(arrow_column, header_column.name, format_name, false, dictionary_values, true); + column = readColumnFromArrowColumn(arrow_column, header_column.name, format_name, false, dictionary_values, true, true, false, skipped); } try @@ -642,7 +688,7 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr & std::vector ArrowColumnToCHColumn::getMissingColumns(const arrow::Schema & schema) const { std::vector missing_columns; - auto block_from_arrow = arrowSchemaToCHHeader(schema, format_name, &header, case_insensitive_matching); + auto block_from_arrow = arrowSchemaToCHHeader(schema, format_name, false, &header, case_insensitive_matching); auto flatten_block_from_arrow = Nested::flatten(block_from_arrow); for (size_t i = 0, columns = header.columns(); i < columns; ++i) diff --git a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h index 0a712326941..695e14b7bba 100644 --- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h +++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h @@ -38,7 +38,11 @@ public: /// Transform arrow schema to ClickHouse header. If hint_header is provided, /// we will skip columns in schema that are not in hint_header. static Block arrowSchemaToCHHeader( - const arrow::Schema & schema, const std::string & format_name, const Block * hint_header = nullptr, bool ignore_case = false); + const arrow::Schema & schema, + const std::string & format_name, + bool skip_columns_with_unsupported_types = false, + const Block * hint_header = nullptr, + bool ignore_case = false); private: const Block & header; diff --git a/src/Processors/Formats/Impl/AvroRowInputFormat.cpp b/src/Processors/Formats/Impl/AvroRowInputFormat.cpp index a372df41344..29429650c19 100644 --- a/src/Processors/Formats/Impl/AvroRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/AvroRowInputFormat.cpp @@ -924,12 +924,12 @@ void registerInputFormatAvro(FormatFactory & factory) void registerAvroSchemaReader(FormatFactory & factory) { - factory.registerSchemaReader("Avro", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + factory.registerSchemaReader("Avro", [](ReadBuffer & buf, const FormatSettings & settings) { return std::make_shared(buf, false, settings); }); - factory.registerSchemaReader("AvroConfluent", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + factory.registerSchemaReader("AvroConfluent", [](ReadBuffer & buf, const FormatSettings & settings) { return std::make_shared(buf, true, settings); }); diff --git a/src/Processors/Formats/Impl/BinaryRowInputFormat.cpp b/src/Processors/Formats/Impl/BinaryRowInputFormat.cpp index 6918220feb4..d3de2fbf494 100644 --- a/src/Processors/Formats/Impl/BinaryRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/BinaryRowInputFormat.cpp @@ -95,7 +95,7 @@ void BinaryFormatReader::skipField(size_t file_column) } BinaryWithNamesAndTypesSchemaReader::BinaryWithNamesAndTypesSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_) - : FormatWithNamesAndTypesSchemaReader(in_, 0, true, true, &reader), reader(in_, format_settings_) + : FormatWithNamesAndTypesSchemaReader(in_, format_settings_, true, true, &reader), reader(in_, format_settings_) { } @@ -119,7 +119,7 @@ void registerInputFormatRowBinary(FormatFactory & factory) void registerRowBinaryWithNamesAndTypesSchemaReader(FormatFactory & factory) { - factory.registerSchemaReader("RowBinaryWithNamesAndTypes", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + factory.registerSchemaReader("RowBinaryWithNamesAndTypes", [](ReadBuffer & buf, const FormatSettings & settings) { return std::make_shared(buf, settings); }); diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp index 216ec6b295a..f246d5c0a35 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp @@ -9,7 +9,6 @@ #include #include #include -#include #include @@ -259,16 +258,15 @@ bool CSVFormatReader::readField( } -CSVSchemaReader::CSVSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, const FormatSettings & format_setting_, ContextPtr context_) +CSVSchemaReader::CSVSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, const FormatSettings & format_setting_) : FormatWithNamesAndTypesSchemaReader( in_, - format_setting_.max_rows_to_read_for_schema_inference, + format_setting_, with_names_, with_types_, &reader, getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule::CSV)) , reader(in_, format_setting_) - , context(context_) { } @@ -279,7 +277,7 @@ DataTypes CSVSchemaReader::readRowAndGetDataTypes() return {}; auto fields = reader.readRow(); - return determineDataTypesByEscapingRule(fields, reader.getFormatSettings(), FormatSettings::EscapingRule::CSV, context); + return determineDataTypesByEscapingRule(fields, reader.getFormatSettings(), FormatSettings::EscapingRule::CSV); } @@ -382,9 +380,9 @@ void registerCSVSchemaReader(FormatFactory & factory) { auto register_func = [&](const String & format_name, bool with_names, bool with_types) { - factory.registerSchemaReader(format_name, [with_names, with_types](ReadBuffer & buf, const FormatSettings & settings, ContextPtr context) + factory.registerSchemaReader(format_name, [with_names, with_types](ReadBuffer & buf, const FormatSettings & settings) { - return std::make_shared(buf, with_names, with_types, settings, context); + return std::make_shared(buf, with_names, with_types, settings); }); }; diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.h b/src/Processors/Formats/Impl/CSVRowInputFormat.h index ad9f6c4e492..ee45264d573 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.h +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.h @@ -74,13 +74,12 @@ public: class CSVSchemaReader : public FormatWithNamesAndTypesSchemaReader { public: - CSVSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, const FormatSettings & format_setting_, ContextPtr context_); + CSVSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, const FormatSettings & format_setting_); private: DataTypes readRowAndGetDataTypes() override; CSVFormatReader reader; - ContextPtr context; }; } diff --git a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp index d2e0d6e21a9..74c5fb1945a 100644 --- a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp @@ -289,17 +289,16 @@ void CustomSeparatedFormatReader::setReadBuffer(ReadBuffer & in_) } CustomSeparatedSchemaReader::CustomSeparatedSchemaReader( - ReadBuffer & in_, bool with_names_, bool with_types_, bool ignore_spaces_, const FormatSettings & format_setting_, ContextPtr context_) + ReadBuffer & in_, bool with_names_, bool with_types_, bool ignore_spaces_, const FormatSettings & format_setting_) : FormatWithNamesAndTypesSchemaReader( buf, - format_setting_.max_rows_to_read_for_schema_inference, + format_setting_, with_names_, with_types_, &reader, getDefaultDataTypeForEscapingRule(format_setting_.custom.escaping_rule)) , buf(in_) , reader(buf, ignore_spaces_, updateFormatSettings(format_setting_)) - , context(context_) { } @@ -315,7 +314,7 @@ DataTypes CustomSeparatedSchemaReader::readRowAndGetDataTypes() first_row = false; auto fields = reader.readRow(); - return determineDataTypesByEscapingRule(fields, reader.getFormatSettings(), reader.getEscapingRule(), context); + return determineDataTypesByEscapingRule(fields, reader.getFormatSettings(), reader.getEscapingRule()); } void registerInputFormatCustomSeparated(FormatFactory & factory) @@ -343,9 +342,9 @@ void registerCustomSeparatedSchemaReader(FormatFactory & factory) { auto register_func = [&](const String & format_name, bool with_names, bool with_types) { - factory.registerSchemaReader(format_name, [with_names, with_types, ignore_spaces](ReadBuffer & buf, const FormatSettings & settings, ContextPtr context) + factory.registerSchemaReader(format_name, [with_names, with_types, ignore_spaces](ReadBuffer & buf, const FormatSettings & settings) { - return std::make_shared(buf, with_names, with_types, ignore_spaces, settings, context); + return std::make_shared(buf, with_names, with_types, ignore_spaces, settings); }); }; diff --git a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.h b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.h index a2f4509d307..d9e62a1b8e9 100644 --- a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.h +++ b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.h @@ -92,14 +92,13 @@ private: class CustomSeparatedSchemaReader : public FormatWithNamesAndTypesSchemaReader { public: - CustomSeparatedSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, bool ignore_spaces_, const FormatSettings & format_setting_, ContextPtr context_); + CustomSeparatedSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, bool ignore_spaces_, const FormatSettings & format_setting_); private: DataTypes readRowAndGetDataTypes() override; PeekableReadBuffer buf; CustomSeparatedFormatReader reader; - ContextPtr context; bool first_row = true; }; diff --git a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp index 0496e3e41a8..854aefc7562 100644 --- a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp @@ -181,15 +181,10 @@ bool JSONCompactEachRowFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer & return true; } -JSONCompactEachRowRowSchemaReader::JSONCompactEachRowRowSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, bool yield_strings_, const FormatSettings & format_settings_) +JSONCompactEachRowRowSchemaReader::JSONCompactEachRowRowSchemaReader( + ReadBuffer & in_, bool with_names_, bool with_types_, bool yield_strings_, const FormatSettings & format_settings_) : FormatWithNamesAndTypesSchemaReader( - in_, - format_settings_.max_rows_to_read_for_schema_inference, - with_names_, - with_types_, - &reader, - nullptr, - format_settings_.json.read_bools_as_numbers) + in_, format_settings_, with_names_, with_types_, &reader, nullptr, format_settings_.json.read_bools_as_numbers) , reader(in_, yield_strings_, format_settings_) { } @@ -239,7 +234,7 @@ void registerJSONCompactEachRowSchemaReader(FormatFactory & factory) { auto register_func = [&](const String & format_name, bool with_names, bool with_types) { - factory.registerSchemaReader(format_name, [=](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + factory.registerSchemaReader(format_name, [=](ReadBuffer & buf, const FormatSettings & settings) { return std::make_shared(buf, with_names, with_types, json_strings, settings); }); diff --git a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp index 2eb5b143107..674bad5a13d 100644 --- a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp @@ -387,12 +387,12 @@ void registerNonTrivialPrefixAndSuffixCheckerJSONEachRow(FormatFactory & factory void registerJSONEachRowSchemaReader(FormatFactory & factory) { - factory.registerSchemaReader("JSONEachRow", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + factory.registerSchemaReader("JSONEachRow", [](ReadBuffer & buf, const FormatSettings & settings) { return std::make_unique(buf, false, settings); }); - factory.registerSchemaReader("JSONStringsEachRow", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + factory.registerSchemaReader("JSONStringsEachRow", [](ReadBuffer & buf, const FormatSettings & settings) { return std::make_unique(buf, true, settings); }); diff --git a/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp b/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp index 722cedbab30..48502e7af3a 100644 --- a/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp @@ -414,7 +414,7 @@ void MsgPackRowInputFormat::setReadBuffer(ReadBuffer & in_) } MsgPackSchemaReader::MsgPackSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_) - : IRowSchemaReader(buf, format_settings_.max_rows_to_read_for_schema_inference), buf(in_), number_of_columns(format_settings_.msgpack.number_of_columns) + : IRowSchemaReader(buf, format_settings_), buf(in_), number_of_columns(format_settings_.msgpack.number_of_columns) { if (!number_of_columns) throw Exception(ErrorCodes::BAD_ARGUMENTS, "You must specify setting input_format_msgpack_number_of_columns to extract table schema from MsgPack data"); @@ -535,7 +535,7 @@ void registerInputFormatMsgPack(FormatFactory & factory) void registerMsgPackSchemaReader(FormatFactory & factory) { - factory.registerSchemaReader("MsgPack", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + factory.registerSchemaReader("MsgPack", [](ReadBuffer & buf, const FormatSettings & settings) { return std::make_shared(buf, settings); }); diff --git a/src/Processors/Formats/Impl/NativeFormat.cpp b/src/Processors/Formats/Impl/NativeFormat.cpp index bd95cfd6376..c1dc60022f5 100644 --- a/src/Processors/Formats/Impl/NativeFormat.cpp +++ b/src/Processors/Formats/Impl/NativeFormat.cpp @@ -133,7 +133,7 @@ void registerOutputFormatNative(FormatFactory & factory) void registerNativeSchemaReader(FormatFactory & factory) { - factory.registerSchemaReader("Native", [](ReadBuffer & buf, const FormatSettings &, ContextPtr) + factory.registerSchemaReader("Native", [](ReadBuffer & buf, const FormatSettings &) { return std::make_shared(buf); }); diff --git a/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp b/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp index 8b8426132de..333129aee81 100644 --- a/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp @@ -3,6 +3,7 @@ #if USE_ORC #include +#include #include #include #include @@ -183,8 +184,9 @@ NamesAndTypesList ORCSchemaReader::readSchema() std::shared_ptr schema; std::atomic is_stopped = 0; getFileReaderAndSchema(in, file_reader, schema, format_settings, is_stopped); - auto header = ArrowColumnToCHColumn::arrowSchemaToCHHeader(*schema, "ORC"); - return header.getNamesAndTypesList(); + auto header = ArrowColumnToCHColumn::arrowSchemaToCHHeader( + *schema, "ORC", format_settings.orc.skip_columns_with_unsupported_types_in_schema_inference); + return getNamesAndRecursivelyNullableTypes(header); } void registerInputFormatORC(FormatFactory & factory) @@ -205,7 +207,7 @@ void registerORCSchemaReader(FormatFactory & factory) { factory.registerSchemaReader( "ORC", - [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + [](ReadBuffer & buf, const FormatSettings & settings) { return std::make_shared(buf, settings); } diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp index 69e51e0dad2..af16d30bcfe 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp @@ -4,6 +4,7 @@ #if USE_PARQUET #include +#include #include #include #include @@ -176,8 +177,9 @@ NamesAndTypesList ParquetSchemaReader::readSchema() std::shared_ptr schema; std::atomic is_stopped = 0; getFileReaderAndSchema(in, file_reader, schema, format_settings, is_stopped); - auto header = ArrowColumnToCHColumn::arrowSchemaToCHHeader(*schema, "Parquet"); - return header.getNamesAndTypesList(); + auto header = ArrowColumnToCHColumn::arrowSchemaToCHHeader( + *schema, "Parquet", format_settings.parquet.skip_columns_with_unsupported_types_in_schema_inference); + return getNamesAndRecursivelyNullableTypes(header); } void registerInputFormatParquet(FormatFactory & factory) @@ -198,7 +200,7 @@ void registerParquetSchemaReader(FormatFactory & factory) { factory.registerSchemaReader( "Parquet", - [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + [](ReadBuffer & buf, const FormatSettings & settings) { return std::make_shared(buf, settings); } diff --git a/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp b/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp index 4754b70d375..f18b6b0aaab 100644 --- a/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp @@ -128,15 +128,14 @@ void RegexpRowInputFormat::setReadBuffer(ReadBuffer & in_) IInputFormat::setReadBuffer(*buf); } -RegexpSchemaReader::RegexpSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_, ContextPtr context_) +RegexpSchemaReader::RegexpSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_) : IRowSchemaReader( buf, - format_settings_.max_rows_to_read_for_schema_inference, + format_settings_, getDefaultDataTypeForEscapingRule(format_settings_.regexp.escaping_rule)) , format_settings(format_settings_) , field_extractor(format_settings) , buf(in_) - , context(context_) { } @@ -152,7 +151,7 @@ DataTypes RegexpSchemaReader::readRowAndGetDataTypes() for (size_t i = 0; i != field_extractor.getMatchedFieldsSize(); ++i) { String field(field_extractor.getField(i)); - data_types.push_back(determineDataTypeByEscapingRule(field, format_settings, format_settings.regexp.escaping_rule, context)); + data_types.push_back(determineDataTypeByEscapingRule(field, format_settings, format_settings.regexp.escaping_rule)); } return data_types; @@ -203,9 +202,9 @@ void registerFileSegmentationEngineRegexp(FormatFactory & factory) void registerRegexpSchemaReader(FormatFactory & factory) { - factory.registerSchemaReader("Regexp", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr context) + factory.registerSchemaReader("Regexp", [](ReadBuffer & buf, const FormatSettings & settings) { - return std::make_shared(buf, settings, context); + return std::make_shared(buf, settings); }); } diff --git a/src/Processors/Formats/Impl/RegexpRowInputFormat.h b/src/Processors/Formats/Impl/RegexpRowInputFormat.h index 04f24bbb3e4..3cc6a3192fd 100644 --- a/src/Processors/Formats/Impl/RegexpRowInputFormat.h +++ b/src/Processors/Formats/Impl/RegexpRowInputFormat.h @@ -76,7 +76,7 @@ private: class RegexpSchemaReader : public IRowSchemaReader { public: - RegexpSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings, ContextPtr context_); + RegexpSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings); private: DataTypes readRowAndGetDataTypes() override; @@ -85,7 +85,6 @@ private: const FormatSettings format_settings; RegexpFieldExtractor field_extractor; PeekableReadBuffer buf; - ContextPtr context; }; } diff --git a/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp b/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp index ceea174c0e8..26c7d1aced5 100644 --- a/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp @@ -283,7 +283,7 @@ void registerInputFormatTSKV(FormatFactory & factory) } void registerTSKVSchemaReader(FormatFactory & factory) { - factory.registerSchemaReader("TSKV", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + factory.registerSchemaReader("TSKV", [](ReadBuffer & buf, const FormatSettings & settings) { return std::make_shared(buf, settings); }); diff --git a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp index bb844ec68ea..b6c9438a57c 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp @@ -235,7 +235,7 @@ TabSeparatedSchemaReader::TabSeparatedSchemaReader( ReadBuffer & in_, bool with_names_, bool with_types_, bool is_raw_, const FormatSettings & format_settings_) : FormatWithNamesAndTypesSchemaReader( in_, - format_settings_.max_rows_to_read_for_schema_inference, + format_settings_, with_names_, with_types_, &reader, @@ -280,7 +280,7 @@ void registerTSVSchemaReader(FormatFactory & factory) { auto register_func = [&](const String & format_name, bool with_names, bool with_types) { - factory.registerSchemaReader(format_name, [with_names, with_types, is_raw](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + factory.registerSchemaReader(format_name, [with_names, with_types, is_raw](ReadBuffer & buf, const FormatSettings & settings) { return std::make_shared(buf, with_names, with_types, is_raw, settings); }); diff --git a/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp b/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp index 06d6ba06bcc..df4d49b172c 100644 --- a/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp @@ -453,14 +453,12 @@ TemplateSchemaReader::TemplateSchemaReader( const ParsedTemplateFormatString & format_, const ParsedTemplateFormatString & row_format_, std::string row_between_delimiter, - const FormatSettings & format_settings_, - ContextPtr context_) - : IRowSchemaReader(buf, format_settings_.max_rows_to_read_for_schema_inference) + const FormatSettings & format_settings_) + : IRowSchemaReader(buf, format_settings_, getDefaultDataTypeForEscapingRules(row_format_.escaping_rules)) , buf(in_) , format(format_) , row_format(row_format_) , format_settings(format_settings_) - , context(context_) , format_reader(buf, ignore_spaces_, format, row_format, row_between_delimiter, format_settings) { setColumnNames(row_format.column_names); @@ -489,7 +487,7 @@ DataTypes TemplateSchemaReader::readRowAndGetDataTypes() format_settings.csv.delimiter = row_format.delimiters[i + 1].empty() ? format_settings.csv.delimiter : row_format.delimiters[i + 1].front(); field = readFieldByEscapingRule(buf, row_format.escaping_rules[i], format_settings); - data_types.push_back(determineDataTypeByEscapingRule(field, format_settings, row_format.escaping_rules[i], context)); + data_types.push_back(determineDataTypeByEscapingRule(field, format_settings, row_format.escaping_rules[i])); } format_reader.skipRowEndDelimiter(); @@ -564,12 +562,12 @@ void registerTemplateSchemaReader(FormatFactory & factory) { for (bool ignore_spaces : {false, true}) { - factory.registerSchemaReader(ignore_spaces ? "TemplateIgnoreSpaces" : "Template", [ignore_spaces](ReadBuffer & buf, const FormatSettings & settings, ContextPtr context) + factory.registerSchemaReader(ignore_spaces ? "TemplateIgnoreSpaces" : "Template", [ignore_spaces](ReadBuffer & buf, const FormatSettings & settings) { size_t index = 0; auto idx_getter = [&](const String &) -> std::optional { return index++; }; auto row_format = fillRowFormat(settings, idx_getter, false); - return std::make_shared(buf, ignore_spaces, fillResultSetFormat(settings), row_format, settings.template_settings.row_between_delimiter, settings, context); + return std::make_shared(buf, ignore_spaces, fillResultSetFormat(settings), row_format, settings.template_settings.row_between_delimiter, settings); }); } } diff --git a/src/Processors/Formats/Impl/TemplateRowInputFormat.h b/src/Processors/Formats/Impl/TemplateRowInputFormat.h index b5ced707ace..ab7043f057e 100644 --- a/src/Processors/Formats/Impl/TemplateRowInputFormat.h +++ b/src/Processors/Formats/Impl/TemplateRowInputFormat.h @@ -116,8 +116,7 @@ public: const ParsedTemplateFormatString & format_, const ParsedTemplateFormatString & row_format_, std::string row_between_delimiter, - const FormatSettings & format_settings_, - ContextPtr context_); + const FormatSettings & format_settings_); DataTypes readRowAndGetDataTypes() override; @@ -126,7 +125,6 @@ private: const ParsedTemplateFormatString format; const ParsedTemplateFormatString row_format; FormatSettings format_settings; - ContextPtr context; TemplateFormatReader format_reader; bool first_row = true; }; diff --git a/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp b/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp index bf8feb077ed..e8b4c69bd19 100644 --- a/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -571,8 +572,8 @@ void ValuesBlockInputFormat::setReadBuffer(ReadBuffer & in_) IInputFormat::setReadBuffer(*buf); } -ValuesSchemaReader::ValuesSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_, ContextPtr context_) - : IRowSchemaReader(buf, format_settings_.max_rows_to_read_for_schema_inference), buf(in_), context(context_) +ValuesSchemaReader::ValuesSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_) + : IRowSchemaReader(buf, format_settings_), buf(in_), format_settings(format_settings_) { } @@ -589,38 +590,25 @@ DataTypes ValuesSchemaReader::readRowAndGetDataTypes() return {}; assertChar('(', buf); - PeekableReadBufferCheckpoint checkpoint(buf); - skipToNextRow(&buf, 0, 1); - buf.makeContinuousMemoryFromCheckpointToPos(); - buf.rollbackToCheckpoint(); - - Tokens tokens(buf.position(), buf.buffer().end()); - IParser::Pos token_iterator(tokens, context->getSettingsRef().max_parser_depth); - + skipWhitespaceIfAny(buf); DataTypes data_types; - bool finish = false; - while (!finish) + String value; + while (!buf.eof() && *buf.position() != ')') { - Expected expected; - ASTPtr ast; + if (!data_types.empty()) + { + skipWhitespaceIfAny(buf); + assertChar(',', buf); + skipWhitespaceIfAny(buf); + } - bool parsed = parser.parse(token_iterator, ast, expected); - /// Consider delimiter after value (',' or ')') as part of expression - parsed &= token_iterator->type == TokenType::Comma || token_iterator->type == TokenType::ClosingRoundBracket; - - if (!parsed) - throw Exception(ErrorCodes::SYNTAX_ERROR, "Cannot parse expression here: {}, token: {}", - String(buf.position(), std::min(SHOW_CHARS_ON_SYNTAX_ERROR, buf.buffer().end() - buf.position())), String(token_iterator.get().begin, token_iterator.get().end)); - - std::pair result = evaluateConstantExpression(ast, context); - data_types.push_back(generalizeDataType(result.second)); - - if (token_iterator->type == TokenType::ClosingRoundBracket) - finish = true; - ++token_iterator; - buf.position() = const_cast(token_iterator->begin); + readQuotedFieldIntoString(value, buf); + auto type = determineDataTypeByEscapingRule(value, format_settings, FormatSettings::EscapingRule::Quoted); + data_types.push_back(std::move(type)); } + assertChar(')', buf); + skipWhitespaceIfAny(buf); if (!buf.eof() && *buf.position() == ',') ++buf.position(); @@ -642,9 +630,9 @@ void registerInputFormatValues(FormatFactory & factory) void registerValuesSchemaReader(FormatFactory & factory) { - factory.registerSchemaReader("Values", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr context) + factory.registerSchemaReader("Values", [](ReadBuffer & buf, const FormatSettings & settings) { - return std::make_shared(buf, settings, context); + return std::make_shared(buf, settings); }); } diff --git a/src/Processors/Formats/Impl/ValuesBlockInputFormat.h b/src/Processors/Formats/Impl/ValuesBlockInputFormat.h index e1521955472..77967181566 100644 --- a/src/Processors/Formats/Impl/ValuesBlockInputFormat.h +++ b/src/Processors/Formats/Impl/ValuesBlockInputFormat.h @@ -97,13 +97,13 @@ private: class ValuesSchemaReader : public IRowSchemaReader { public: - ValuesSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings, ContextPtr context_); + ValuesSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings); private: DataTypes readRowAndGetDataTypes() override; PeekableReadBuffer buf; - ContextPtr context; + const FormatSettings format_settings; ParserExpression parser; bool first_row = true; }; diff --git a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp index 4e67007c943..2bc8409c70e 100644 --- a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp +++ b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp @@ -293,13 +293,13 @@ void RowInputFormatWithNamesAndTypes::setReadBuffer(ReadBuffer & in_) FormatWithNamesAndTypesSchemaReader::FormatWithNamesAndTypesSchemaReader( ReadBuffer & in_, - size_t max_rows_to_read_, + const FormatSettings & format_settings, bool with_names_, bool with_types_, FormatWithNamesAndTypesReader * format_reader_, DataTypePtr default_type_, bool allow_bools_as_numbers_) - : IRowSchemaReader(in_, max_rows_to_read_, default_type_, allow_bools_as_numbers_), with_names(with_names_), with_types(with_types_), format_reader(format_reader_) + : IRowSchemaReader(in_, format_settings, default_type_, allow_bools_as_numbers_), with_names(with_names_), with_types(with_types_), format_reader(format_reader_) { } diff --git a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h index 8fbd426112c..6f64c6ad4a2 100644 --- a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h +++ b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h @@ -124,7 +124,7 @@ class FormatWithNamesAndTypesSchemaReader : public IRowSchemaReader public: FormatWithNamesAndTypesSchemaReader( ReadBuffer & in, - size_t max_rows_to_read_, + const FormatSettings & format_settings, bool with_names_, bool with_types_, FormatWithNamesAndTypesReader * format_reader_, diff --git a/tests/queries/0_stateless/01801_s3_cluster.reference b/tests/queries/0_stateless/01801_s3_cluster.reference index 31c97f14fa3..0448ff3933b 100644 --- a/tests/queries/0_stateless/01801_s3_cluster.reference +++ b/tests/queries/0_stateless/01801_s3_cluster.reference @@ -2,30 +2,6 @@ 0 0 0 0 0 0 1 2 3 -10 11 12 -13 14 15 -16 17 18 -20 21 22 -23 24 25 -26 27 28 -4 5 6 -7 8 9 -0 0 0 -0 0 0 -0 0 0 -1 2 3 -10 11 12 -13 14 15 -16 17 18 -20 21 22 -23 24 25 -26 27 28 -4 5 6 -7 8 9 -0 0 0 -0 0 0 -0 0 0 -1 2 3 4 5 6 7 8 9 10 11 12 @@ -38,14 +14,26 @@ 0 0 0 0 0 0 1 2 3 +4 5 6 +7 8 9 10 11 12 13 14 15 16 17 18 20 21 22 23 24 25 26 27 28 +0 0 0 +0 0 0 +0 0 0 +1 2 3 4 5 6 7 8 9 +10 11 12 +13 14 15 +16 17 18 +20 21 22 +23 24 25 +26 27 28 0 0 0 0 0 0 0 0 0 @@ -62,14 +50,26 @@ 0 0 0 0 0 0 1 2 3 +4 5 6 +7 8 9 10 11 12 13 14 15 16 17 18 20 21 22 23 24 25 26 27 28 +0 0 0 +0 0 0 +0 0 0 +1 2 3 4 5 6 7 8 9 +10 11 12 +13 14 15 +16 17 18 +20 21 22 +23 24 25 +26 27 28 0 0 0 0 0 0 0 0 0 diff --git a/tests/queries/0_stateless/02149_schema_inference.reference b/tests/queries/0_stateless/02149_schema_inference.reference index e4a0c3c3602..2d7dd5caca7 100644 --- a/tests/queries/0_stateless/02149_schema_inference.reference +++ b/tests/queries/0_stateless/02149_schema_inference.reference @@ -1,17 +1,17 @@ TSV -c1 Nullable(String) +c1 Nullable(Float64) c2 Nullable(String) -c3 Nullable(String) -c4 Nullable(String) -42 Some string [1, 2, 3, 4] (1, 2, 3) -42 abcd [] (4, 5, 6) +c3 Array(Nullable(Float64)) +c4 Tuple(Nullable(Float64), Nullable(Float64), Nullable(Float64)) +42 Some string [1,2,3,4] (1,2,3) +42 abcd [] (4,5,6) TSVWithNames -number Nullable(String) +number Nullable(Float64) string Nullable(String) -array Nullable(String) -tuple Nullable(String) -42 Some string [1, 2, 3, 4] (1, 2, 3) -42 abcd [] (4, 5, 6) +array Array(Nullable(Float64)) +tuple Tuple(Nullable(Float64), Nullable(Float64), Nullable(Float64)) +42 Some string [1,2,3,4] (1,2,3) +42 abcd [] (4,5,6) CSV c1 Nullable(Float64) c2 Nullable(String) @@ -73,13 +73,13 @@ c Array(Nullable(Float64)) \N \N [] \N \N [3] TSKV -a Nullable(String) +a Nullable(Float64) b Nullable(String) -c Nullable(String) -1 s1 \N +c Array(Nullable(Float64)) +1 s1 [] 2 } [2] -\N \N \N -\N \N \N +\N \N [] +\N \N [] \N \N [3] Values c1 Nullable(Float64) @@ -96,7 +96,7 @@ c5 Tuple(Array(Nullable(Float64)), Array(Tuple(Nullable(Float64), Nullable(Strin 42.42 \N [1,NULL,3] (1,NULL) ([1,2],[(3,'4'),(5,'6')]) \N Some string [10] (1,2) ([],[]) Regexp -c1 Nullable(String) +c1 Nullable(Float64) c2 Nullable(String) c3 Nullable(String) 42 Some string 1 [([1, 2, 3], String 1), ([], String 1)] diff --git a/tests/queries/0_stateless/02149_schema_inference_formats_with_schema.reference b/tests/queries/0_stateless/02149_schema_inference_formats_with_schema.reference index d3d2d86d696..b0ec4bef499 100644 --- a/tests/queries/0_stateless/02149_schema_inference_formats_with_schema.reference +++ b/tests/queries/0_stateless/02149_schema_inference_formats_with_schema.reference @@ -1,137 +1,137 @@ Arrow -int8 Int8 -uint8 UInt8 -int16 Int16 -uint16 UInt16 -int32 Int32 -uint32 UInt32 -int64 Int64 -uint64 UInt64 +int8 Nullable(Int8) +uint8 Nullable(UInt8) +int16 Nullable(Int16) +uint16 Nullable(UInt16) +int32 Nullable(Int32) +uint32 Nullable(UInt32) +int64 Nullable(Int64) +uint64 Nullable(UInt64) 0 0 0 0 0 0 0 0 -1 1 -1 1 -1 1 -1 1 -float32 Float32 -float64 Float64 -decimal32 Decimal(9, 5) -decimal64 Decimal(18, 5) +float32 Nullable(Float32) +float64 Nullable(Float64) +decimal32 Nullable(Decimal(9, 5)) +decimal64 Nullable(Decimal(18, 5)) 0 0 0 0 1.2 0.7692307692307692 3.33333 333.33333 -date UInt16 -date32 Date32 +date Nullable(UInt16) +date32 Nullable(Date32) 0 1970-01-01 1 1970-01-02 -str String -fixed_string String +str Nullable(String) +fixed_string Nullable(String) Str: 0 100 Str: 1 200 -array Array(UInt64) -tuple Tuple(`tuple.0` UInt64, `tuple.1` String) -map Map(String, UInt64) +array Array(Nullable(UInt64)) +tuple Tuple(Nullable(UInt64), Nullable(String)) +map Map(String, Nullable(UInt64)) [0,1] (0,'0') {'0':0} [1,2] (1,'1') {'1':1} -nested1 Array(Tuple(`nested1.0` Array(UInt64), `nested1.1` Map(String, UInt64))) -nested2 Tuple(`nested2.0` Tuple(`nested2.0.0` Array(Array(UInt64)), `nested2.0.1` Map(UInt64, Array(Tuple(`nested2.0.1.0` UInt64, `nested2.0.1.1` String)))), `nested2.1` UInt8) +nested1 Array(Tuple(Array(Nullable(UInt64)), Map(String, Nullable(UInt64)))) +nested2 Tuple(Tuple(Array(Array(Nullable(UInt64))), Map(UInt64, Array(Tuple(Nullable(UInt64), Nullable(String))))), Nullable(UInt8)) [([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) [([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) ArrowStream -int8 Int8 -uint8 UInt8 -int16 Int16 -uint16 UInt16 -int32 Int32 -uint32 UInt32 -int64 Int64 -uint64 UInt64 +int8 Nullable(Int8) +uint8 Nullable(UInt8) +int16 Nullable(Int16) +uint16 Nullable(UInt16) +int32 Nullable(Int32) +uint32 Nullable(UInt32) +int64 Nullable(Int64) +uint64 Nullable(UInt64) 0 0 0 0 0 0 0 0 -1 1 -1 1 -1 1 -1 1 -float32 Float32 -float64 Float64 -decimal32 Decimal(9, 5) -decimal64 Decimal(18, 5) +float32 Nullable(Float32) +float64 Nullable(Float64) +decimal32 Nullable(Decimal(9, 5)) +decimal64 Nullable(Decimal(18, 5)) 0 0 0 0 1.2 0.7692307692307692 3.33333 333.33333 -date UInt16 -date32 Date32 +date Nullable(UInt16) +date32 Nullable(Date32) 0 1970-01-01 1 1970-01-02 -str String -fixed_string String +str Nullable(String) +fixed_string Nullable(String) Str: 0 100 Str: 1 200 -array Array(UInt64) -tuple Tuple(`tuple.0` UInt64, `tuple.1` String) -map Map(String, UInt64) +array Array(Nullable(UInt64)) +tuple Tuple(Nullable(UInt64), Nullable(String)) +map Map(String, Nullable(UInt64)) [0,1] (0,'0') {'0':0} [1,2] (1,'1') {'1':1} -nested1 Array(Tuple(`nested1.0` Array(UInt64), `nested1.1` Map(String, UInt64))) -nested2 Tuple(`nested2.0` Tuple(`nested2.0.0` Array(Array(UInt64)), `nested2.0.1` Map(UInt64, Array(Tuple(`nested2.0.1.0` UInt64, `nested2.0.1.1` String)))), `nested2.1` UInt8) +nested1 Array(Tuple(Array(Nullable(UInt64)), Map(String, Nullable(UInt64)))) +nested2 Tuple(Tuple(Array(Array(Nullable(UInt64))), Map(UInt64, Array(Tuple(Nullable(UInt64), Nullable(String))))), Nullable(UInt8)) [([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) [([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) Parquet -int8 Int8 -uint8 UInt8 -int16 Int16 -uint16 UInt16 -int32 Int32 -uint32 Int64 -int64 Int64 -uint64 UInt64 +int8 Nullable(Int8) +uint8 Nullable(UInt8) +int16 Nullable(Int16) +uint16 Nullable(UInt16) +int32 Nullable(Int32) +uint32 Nullable(Int64) +int64 Nullable(Int64) +uint64 Nullable(UInt64) 0 0 0 0 0 0 0 0 -1 1 -1 1 -1 1 -1 1 -float32 Float32 -float64 Float64 -decimal32 Decimal(9, 5) -decimal64 Decimal(18, 5) +float32 Nullable(Float32) +float64 Nullable(Float64) +decimal32 Nullable(Decimal(9, 5)) +decimal64 Nullable(Decimal(18, 5)) 0 0 0 0 1.2 0.7692307692307692 3.33333 333.33333 -date UInt16 -date32 Date32 +date Nullable(UInt16) +date32 Nullable(Date32) 0 1970-01-01 1 1970-01-02 -str String -fixed_string String +str Nullable(String) +fixed_string Nullable(String) Str: 0 100 Str: 1 200 -array Array(UInt64) -tuple Tuple(`tuple.0` UInt64, `tuple.1` String) -map Map(String, UInt64) +array Array(Nullable(UInt64)) +tuple Tuple(Nullable(UInt64), Nullable(String)) +map Map(String, Nullable(UInt64)) [0,1] (0,'0') {'0':0} [1,2] (1,'1') {'1':1} -nested1 Array(Tuple(`nested1.0` Array(UInt64), `nested1.1` Map(String, UInt64))) -nested2 Tuple(`nested2.0` Tuple(`nested2.0.0` Array(Array(UInt64)), `nested2.0.1` Map(UInt64, Array(Tuple(`nested2.0.1.0` UInt64, `nested2.0.1.1` String)))), `nested2.1` UInt8) +nested1 Array(Tuple(Array(Nullable(UInt64)), Map(String, Nullable(UInt64)))) +nested2 Tuple(Tuple(Array(Array(Nullable(UInt64))), Map(UInt64, Array(Tuple(Nullable(UInt64), Nullable(String))))), Nullable(UInt8)) [([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) [([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) ORC -int8 Int8 -uint8 Int8 -int16 Int16 -uint16 Int16 -int32 Int32 -uint32 Int32 -int64 Int64 -uint64 Int64 +int8 Nullable(Int8) +uint8 Nullable(Int8) +int16 Nullable(Int16) +uint16 Nullable(Int16) +int32 Nullable(Int32) +uint32 Nullable(Int32) +int64 Nullable(Int64) +uint64 Nullable(Int64) 0 0 0 0 0 0 0 0 -1 1 -1 1 -1 1 -1 1 -float32 Float32 -float64 Float64 -decimal32 Decimal(9, 5) -decimal64 Decimal(18, 5) +float32 Nullable(Float32) +float64 Nullable(Float64) +decimal32 Nullable(Decimal(9, 5)) +decimal64 Nullable(Decimal(18, 5)) 0 0 0 0 1.2 0.7692307692307692 3.33333 333.33333 -date Date32 -date32 Date32 +date Nullable(Date32) +date32 Nullable(Date32) 1970-01-01 1970-01-01 1970-01-02 1970-01-02 -str String -fixed_string String +str Nullable(String) +fixed_string Nullable(String) Str: 0 100 Str: 1 200 -array Array(Int64) -tuple Tuple(`tuple.0` Int64, `tuple.1` String) -map Map(String, Int64) +array Array(Nullable(Int64)) +tuple Tuple(Nullable(Int64), Nullable(String)) +map Map(String, Nullable(Int64)) [0,1] (0,'0') {'0':0} [1,2] (1,'1') {'1':1} -nested1 Array(Tuple(`nested1.0` Array(Int64), `nested1.1` Map(String, Int64))) -nested2 Tuple(`nested2.0` Tuple(`nested2.0.0` Array(Array(Int64)), `nested2.0.1` Map(Int64, Array(Tuple(`nested2.0.1.0` Int64, `nested2.0.1.1` String)))), `nested2.1` Int8) +nested1 Array(Tuple(Array(Nullable(Int64)), Map(String, Nullable(Int64)))) +nested2 Tuple(Tuple(Array(Array(Nullable(Int64))), Map(Int64, Array(Tuple(Nullable(Int64), Nullable(String))))), Nullable(Int8)) [([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) [([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) Native diff --git a/tests/queries/0_stateless/02166_arrow_dictionary_inference.reference b/tests/queries/0_stateless/02166_arrow_dictionary_inference.reference index 46f448cfba7..20f3368e446 100644 --- a/tests/queries/0_stateless/02166_arrow_dictionary_inference.reference +++ b/tests/queries/0_stateless/02166_arrow_dictionary_inference.reference @@ -1 +1 @@ -x LowCardinality(UInt64) +x LowCardinality(Nullable(UInt64)) diff --git a/tests/queries/0_stateless/02166_arrow_dictionary_inference.sh b/tests/queries/0_stateless/02166_arrow_dictionary_inference.sh index e560dc10d2c..7d313b571d9 100755 --- a/tests/queries/0_stateless/02166_arrow_dictionary_inference.sh +++ b/tests/queries/0_stateless/02166_arrow_dictionary_inference.sh @@ -5,7 +5,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . "$CURDIR"/../shell_config.sh -$CLICKHOUSE_CLIENT -q "insert into table function file('arrow.dict', 'Arrow', 'x LowCardinality(UInt64)') select number from numbers(10) settings output_format_arrow_low_cardinality_as_dictionary=1" +$CLICKHOUSE_CLIENT -q "insert into table function file('arrow.dict', 'Arrow', 'x LowCardinality(UInt64)') select number from numbers(10) settings output_format_arrow_low_cardinality_as_dictionary=1, engine_file_truncate_on_insert=1" $CLICKHOUSE_CLIENT -q "desc file('arrow.dict', 'Arrow')" diff --git a/tests/queries/0_stateless/02211_shcema_inference_from_stdin.reference b/tests/queries/0_stateless/02211_shcema_inference_from_stdin.reference index d176e0ee1ed..6920aa16198 100644 --- a/tests/queries/0_stateless/02211_shcema_inference_from_stdin.reference +++ b/tests/queries/0_stateless/02211_shcema_inference_from_stdin.reference @@ -9,7 +9,7 @@ x Nullable(Float64) 7 8 9 -c1 Nullable(String) -c2 Nullable(String) -c3 Nullable(String) +c1 Nullable(Float64) +c2 Nullable(Float64) +c3 Nullable(Float64) 1 2 3 diff --git a/tests/queries/0_stateless/02240_tskv_schema_inference_bug.reference b/tests/queries/0_stateless/02240_tskv_schema_inference_bug.reference index 69ed3536951..d0ced74f8f6 100644 --- a/tests/queries/0_stateless/02240_tskv_schema_inference_bug.reference +++ b/tests/queries/0_stateless/02240_tskv_schema_inference_bug.reference @@ -1,8 +1,8 @@ -a Nullable(String) +a Nullable(Float64) b Nullable(String) -c Nullable(String) -1 s1 \N +c Array(Nullable(Float64)) +1 s1 [] 2 } [2] -\N \N \N -\N \N \N +\N \N [] +\N \N [] \N \N [3] diff --git a/tests/queries/0_stateless/02242_arrow_orc_parquet_nullable_schema_inference.reference b/tests/queries/0_stateless/02242_arrow_orc_parquet_nullable_schema_inference.reference new file mode 100644 index 00000000000..debc5c58936 --- /dev/null +++ b/tests/queries/0_stateless/02242_arrow_orc_parquet_nullable_schema_inference.reference @@ -0,0 +1,40 @@ +Arrow +x Nullable(UInt64) +arr1 Array(Nullable(UInt64)) +arr2 Array(Array(Nullable(String))) +arr3 Array(Tuple(Nullable(String), Nullable(UInt64))) +0 [0,1] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,0)] +\N [NULL,2] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,1)] +2 [2,3] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,2)] +\N [NULL,4] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,3)] +4 [4,5] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,4)] +ArrowStream +x Nullable(UInt64) +arr1 Array(Nullable(UInt64)) +arr2 Array(Array(Nullable(String))) +arr3 Array(Tuple(Nullable(String), Nullable(UInt64))) +0 [0,1] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,0)] +\N [NULL,2] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,1)] +2 [2,3] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,2)] +\N [NULL,4] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,3)] +4 [4,5] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,4)] +Parquet +x Nullable(UInt64) +arr1 Array(Nullable(UInt64)) +arr2 Array(Array(Nullable(String))) +arr3 Array(Tuple(Nullable(String), Nullable(UInt64))) +0 [0,1] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,0)] +\N [NULL,2] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,1)] +2 [2,3] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,2)] +\N [NULL,4] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,3)] +4 [4,5] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,4)] +ORC +x Nullable(Int64) +arr1 Array(Nullable(Int64)) +arr2 Array(Array(Nullable(String))) +arr3 Array(Tuple(Nullable(String), Nullable(Int64))) +0 [0,1] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,0)] +\N [NULL,2] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,1)] +2 [2,3] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,2)] +\N [NULL,4] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,3)] +4 [4,5] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,4)] diff --git a/tests/queries/0_stateless/02242_arrow_orc_parquet_nullable_schema_inference.sh b/tests/queries/0_stateless/02242_arrow_orc_parquet_nullable_schema_inference.sh new file mode 100755 index 00000000000..1b6999e3f09 --- /dev/null +++ b/tests/queries/0_stateless/02242_arrow_orc_parquet_nullable_schema_inference.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash +# Tags: no-parallel, no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + + +USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') +FILE_NAME=test_02242.data +DATA_FILE=$USER_FILES_PATH/$FILE_NAME + +for format in Arrow ArrowStream Parquet ORC +do + echo $format + $CLICKHOUSE_CLIENT -q "select number % 2 ? NULL : number as x, [number % 2 ? NULL : number, number + 1] as arr1, [[NULL, 'String'], [NULL], []] as arr2, [(NULL, NULL), ('String', NULL), (NULL, number)] as arr3 from numbers(5) format $format" > $DATA_FILE + $CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', '$format')" + $CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', '$format')" +done + +rm $DATA_FILE diff --git a/tests/queries/0_stateless/02243_arrow_read_null_type_to_nullable_column.reference b/tests/queries/0_stateless/02243_arrow_read_null_type_to_nullable_column.reference new file mode 100644 index 00000000000..f599e28b8ab --- /dev/null +++ b/tests/queries/0_stateless/02243_arrow_read_null_type_to_nullable_column.reference @@ -0,0 +1 @@ +10 diff --git a/tests/queries/0_stateless/02243_arrow_read_null_type_to_nullable_column.sh b/tests/queries/0_stateless/02243_arrow_read_null_type_to_nullable_column.sh new file mode 100755 index 00000000000..cc8db7fb316 --- /dev/null +++ b/tests/queries/0_stateless/02243_arrow_read_null_type_to_nullable_column.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash +# Tags: no-fasttest + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT -q "drop table if exists test_02243" +$CLICKHOUSE_CLIENT -q "create table test_02243 (image_path Nullable(String), + caption Nullable(String), + NSFW Nullable(String), + similarity Nullable(Float64), + LICENSE Nullable(String), + url Nullable(String), + key Nullable(UInt64), + shard_id Nullable(UInt64), + status Nullable(String), + error_message Nullable(String), + width Nullable(UInt32), + height Nullable(UInt32), + exif Nullable(String), + original_width Nullable(UInt32), + original_height Nullable(UInt32)) engine=Memory" + +cat $CUR_DIR/data_parquet_bad_column/metadata_0.parquet | $CLICKHOUSE_CLIENT --stacktrace -q "insert into test_02243 format Parquet" + +$CLICKHOUSE_CLIENT -q "select count() from test_02243" +$CLICKHOUSE_CLIENT -q "drop table test_02243" diff --git a/tests/queries/0_stateless/02244_column_names_in_shcmea_inference.reference b/tests/queries/0_stateless/02244_column_names_in_shcmea_inference.reference new file mode 100644 index 00000000000..d237caf630f --- /dev/null +++ b/tests/queries/0_stateless/02244_column_names_in_shcmea_inference.reference @@ -0,0 +1,8 @@ +x Nullable(String) +y Nullable(Float64) +x Nullable(String) +y Nullable(Float64) +x Nullable(String) +y Nullable(Float64) +x Nullable(String) +y Nullable(Float64) diff --git a/tests/queries/0_stateless/02244_column_names_in_shcmea_inference.sql b/tests/queries/0_stateless/02244_column_names_in_shcmea_inference.sql new file mode 100644 index 00000000000..af56856f0be --- /dev/null +++ b/tests/queries/0_stateless/02244_column_names_in_shcmea_inference.sql @@ -0,0 +1,14 @@ +-- Tags: no-fasttest, no-parallel + +insert into function file('test_02244', 'TSV', 'x String, y UInt32') select 'Hello, world!', 42 settings engine_file_truncate_on_insert=1; +desc file('test_02244', 'TSV') settings column_names_for_schema_inference='x,y'; + +insert into function file('test_02244', 'CSV', 'x String, y UInt32') select 'Hello, world!', 42 settings engine_file_truncate_on_insert=1; +desc file('test_02244', 'CSV') settings column_names_for_schema_inference='x,y'; + +insert into function file('test_02244', 'JSONCompactEachRow', 'x String, y UInt32') select 'Hello, world!', 42 settings engine_file_truncate_on_insert=1; +desc file('test_02244', 'JSONCompactEachRow') settings column_names_for_schema_inference='x,y'; + +insert into function file('test_02244', 'Values', 'x String, y UInt32') select 'Hello, world!', 42 settings engine_file_truncate_on_insert=1; +desc file('test_02244', 'Values') settings column_names_for_schema_inference='x,y'; + diff --git a/tests/queries/0_stateless/02245_parquet_skip_unknown_type.reference b/tests/queries/0_stateless/02245_parquet_skip_unknown_type.reference new file mode 100644 index 00000000000..4f9cde534f0 --- /dev/null +++ b/tests/queries/0_stateless/02245_parquet_skip_unknown_type.reference @@ -0,0 +1,16 @@ +OK +image_path Nullable(String) +caption Nullable(String) +NSFW Nullable(String) +similarity Nullable(Float64) +LICENSE Nullable(String) +url Nullable(String) +key Nullable(Int64) +shard_id Nullable(Int64) +status Nullable(String) +width Nullable(Int64) +height Nullable(Int64) +exif Nullable(String) +original_width Nullable(Int64) +original_height Nullable(Int64) +10 diff --git a/tests/queries/0_stateless/02245_parquet_skip_unknown_type.sh b/tests/queries/0_stateless/02245_parquet_skip_unknown_type.sh new file mode 100755 index 00000000000..005c089e434 --- /dev/null +++ b/tests/queries/0_stateless/02245_parquet_skip_unknown_type.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash +# Tags: no-fasttest + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') +FILE_NAME=test_02245.parquet +DATA_FILE=$USER_FILES_PATH/$FILE_NAME + +cp $CUR_DIR/data_parquet_bad_column/metadata_0.parquet $DATA_FILE + + +$CLICKHOUSE_CLIENT -q "desc file(test_02245.parquet)" 2>&1 | grep -qF "CANNOT_EXTRACT_TABLE_STRUCTURE" && echo "OK" || echo "FAIL" +$CLICKHOUSE_CLIENT -q "desc file(test_02245.parquet) settings input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference=1" +$CLICKHOUSE_CLIENT -q "select count(*) from file(test_02245.parquet) settings input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference=1" + diff --git a/tests/queries/0_stateless/02245_s3_schema_desc.reference b/tests/queries/0_stateless/02245_s3_schema_desc.reference index a5b0f81a2c7..e039680d933 100644 --- a/tests/queries/0_stateless/02245_s3_schema_desc.reference +++ b/tests/queries/0_stateless/02245_s3_schema_desc.reference @@ -1,21 +1,21 @@ -c1 Nullable(String) -c2 Nullable(String) -c3 Nullable(String) -c1 Nullable(String) -c2 Nullable(String) -c3 Nullable(String) +c1 Nullable(Float64) +c2 Nullable(Float64) +c3 Nullable(Float64) +c1 Nullable(Float64) +c2 Nullable(Float64) +c3 Nullable(Float64) c1 UInt64 c2 UInt64 c3 UInt64 -c1 Nullable(String) -c2 Nullable(String) -c3 Nullable(String) +c1 Nullable(Float64) +c2 Nullable(Float64) +c3 Nullable(Float64) c1 UInt64 c2 UInt64 c3 UInt64 -c1 Nullable(String) -c2 Nullable(String) -c3 Nullable(String) +c1 Nullable(Float64) +c2 Nullable(Float64) +c3 Nullable(Float64) c1 UInt64 c2 UInt64 c3 UInt64 diff --git a/tests/queries/0_stateless/02245_s3_schema_desc.sql b/tests/queries/0_stateless/02245_s3_schema_desc.sql index 4ab870e1379..2cd362ff233 100644 --- a/tests/queries/0_stateless/02245_s3_schema_desc.sql +++ b/tests/queries/0_stateless/02245_s3_schema_desc.sql @@ -10,4 +10,5 @@ desc s3Cluster('test_cluster_two_shards_localhost', 'http://localhost:11111/test desc s3Cluster('test_cluster_two_shards_localhost', 'http://localhost:11111/test/{a,b,c}.tsv', 'test', 'testtest', 'TSV', 'c1 UInt64, c2 UInt64, c3 UInt64'); desc s3Cluster('test_cluster_two_shards_localhost', 'http://localhost:11111/test/{a,b,c}.tsv', 'test', 'testtest', 'TSV', 'c1 UInt64, c2 UInt64, c3 UInt64', 'auto'); + SELECT * FROM s3(decodeURLComponent(NULL), [NULL]); --{serverError 170} diff --git a/tests/queries/0_stateless/02246_tsv_csv_best_effort_schema_inference.reference b/tests/queries/0_stateless/02246_tsv_csv_best_effort_schema_inference.reference new file mode 100644 index 00000000000..c245f13fdbe --- /dev/null +++ b/tests/queries/0_stateless/02246_tsv_csv_best_effort_schema_inference.reference @@ -0,0 +1,107 @@ +TSV +c1 Nullable(Float64) +c2 Nullable(String) +c3 Array(Nullable(Float64)) +c4 Tuple(Nullable(Float64), Nullable(Float64), Nullable(Float64)) +42 Some string [1,2,3,4] (1,2,3) +42 abcd [] (4,5,6) +c1 Nullable(String) +[({\'key\' : 42.42}, [\'String\', \'String2\'], 42.42), ({}, [], -42), ({\'key2\' : NULL}, [NULL], NULL)] +[] +[({}, [], 0)] +[({}, [NULL], NULL)] +[({}, [\'String3\'], NULL)] +[({\'key3\': NULL}, []), NULL] +c1 Array(Tuple(Map(String, Nullable(Float64)), Array(Nullable(String)), Nullable(Float64))) +[({'key':42.42},['String','String2'],42.42),({},[],-42),({'key2':NULL},[NULL],NULL)] +[] +[({},[],0)] +[({},[NULL],NULL)] +[({},['String3'],NULL)] +[({'key3':NULL},[],NULL)] +c1 Nullable(Bool) +true +false +\N +c1 Array(Nullable(Bool)) +[true,NULL] +[] +[NULL] +[false] +c1 Nullable(String) +[] +c1 Nullable(String) +{} +c1 Nullable(String) +() +c1 Nullable(String) +[1, 2, 3 +c1 Nullable(String) +[(1, 2, 3 4)] +c1 Nullable(String) +[1, 2, 3 + 4] +c1 Nullable(String) +(1, 2, +c1 Nullable(String) +[1, Some trash, 42.2] +c1 Nullable(String) +[1, \'String\', {\'key\' : 2}] +c1 Nullable(String) +{\'key\' : 1, [1] : 10} +c1 Nullable(String) +{}{} +c1 Nullable(String) +[1, 2, 3 +c1 Nullable(String) +[abc, def] +c1 Array(Nullable(String)) +['abc','def'] +c1 Nullable(String) +[\'string] +c1 Nullable(String) +\'string +c1 Nullable(Float64) +42.42 +c1 Nullable(String) +42.42sometrash +c1 Nullable(String) +[42.42sometrash, 42.42] + +CSV +c1 Nullable(String) +c2 Nullable(String) +c3 Array(Nullable(Float64)) +c4 Array(Tuple(Nullable(Float64), Nullable(Float64), Nullable(Float64))) +42 Some string [1,2,3,4] [(1,2,3)] +42\\ abcd [] [(4,5,6)] +c1 Nullable(String) +[({\'key\' : 42.42}, [\'String\', \'String2\'], 42.42), ({}, [], -42), ({\'key2\' : NULL}, [NULL], NULL)] +[] +[({}, [], 0)] +[({}, [NULL], NULL)] +[({}, [\'String3\'], NULL)] +[({\'key3\': NULL}, []), NULL] +c1 Array(Tuple(Map(String, Nullable(Float64)), Array(Nullable(String)), Nullable(Float64))) +[({'key':42.42},['String','String2'],42.42),({},[],-42),({'key2':NULL},[NULL],NULL)] +[] +[({},[],0)] +[({},[NULL],NULL)] +[({},['String3'],NULL)] +[({'key3':NULL},[],NULL)] +c1 Nullable(Bool) +true +false +\N +c1 Array(Nullable(Bool)) +[true,NULL] +[] +[NULL] +[false] +c1 Nullable(String) +(1, 2, 3) +c1 Nullable(String) +123.123 +c1 Array(Tuple(Nullable(Float64), Nullable(Float64), Nullable(Float64))) +[(1,2,3)] +c1 Array(Tuple(Nullable(Float64), Nullable(Float64), Nullable(Float64))) +[(1,2,3)] diff --git a/tests/queries/0_stateless/02246_tsv_csv_best_effort_schema_inference.sh b/tests/queries/0_stateless/02246_tsv_csv_best_effort_schema_inference.sh new file mode 100755 index 00000000000..6589765f739 --- /dev/null +++ b/tests/queries/0_stateless/02246_tsv_csv_best_effort_schema_inference.sh @@ -0,0 +1,220 @@ +#!/usr/bin/env bash +# Tags: no-parallel, no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + + +USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') +FILE_NAME=test_02149.data +DATA_FILE=${USER_FILES_PATH:?}/$FILE_NAME + +touch $DATA_FILE + +echo "TSV" + +echo -e "42\tSome string\t[1, 2, 3, 4]\t(1, 2, 3) +42\tabcd\t[]\t(4, 5, 6)" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')" + +echo -e "[({'key' : 42.42}, ['String', 'String2'], 42.42), ({}, [], -42), ({'key2' : NULL}, [NULL], NULL)] +[] +[({}, [], 0)] +[({}, [NULL], NULL)] +[({}, ['String3'], NULL)] +[({'key3': NULL}, []), NULL]"> $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV') settings input_format_tsv_use_best_effort_in_schema_inference=false" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV') settings input_format_tsv_use_best_effort_in_schema_inference=false" + + +echo -e "[({'key' : 42.42}, ['String', 'String2'], 42.42), ({}, [], -42), ({'key2' : NULL}, [NULL], NULL)] +[] +[({}, [], 0)] +[({}, [NULL], NULL)] +[({}, ['String3'], NULL)] +[({'key3': NULL}, [], NULL)]"> $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')" + +echo -e "true +false +\N" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')" + +echo -e "[true, NULL] +[] +[NULL] +[false]" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')" + +echo -e "[]" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')" + +echo -e "{}" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')" + +echo -e "()" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')" + +echo -e "[1, 2, 3" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')" + +echo -e "[(1, 2, 3 4)]" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')" + +echo -e "[1, 2, 3 + 4]" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')" + +echo -e "(1, 2," > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')" + +echo -e "[1, Some trash, 42.2]" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')" + +echo -e "[1, 'String', {'key' : 2}]" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')" + +echo -e "{'key' : 1, [1] : 10}" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')" + +echo -e "{}{}" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')" + +echo -e "[1, 2, 3" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')" + +echo -e "[abc, def]" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')" + +echo -e "['abc', 'def']" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')" + +echo -e "['string]" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')" + +echo -e "'string" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')" + +echo -e "42.42" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')" + +echo -e "42.42sometrash" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')" + +echo -e "[42.42sometrash, 42.42]" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')" + + +echo +echo "CSV" + +echo -e "42,Some string,'[1, 2, 3, 4]','[(1, 2, 3)]' +42\,abcd,'[]','[(4, 5, 6)]'" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CSV')" + +echo -e "\"[({'key' : 42.42}, ['String', 'String2'], 42.42), ({}, [], -42), ({'key2' : NULL}, [NULL], NULL)]\" +'[]' +'[({}, [], 0)]' +'[({}, [NULL], NULL)]' +\"[({}, ['String3'], NULL)]\" +\"[({'key3': NULL}, []), NULL]\""> $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CSV') settings input_format_csv_use_best_effort_in_schema_inference=false" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CSV') settings input_format_csv_use_best_effort_in_schema_inference=false" + +echo -e "\"[({'key' : 42.42}, ['String', 'String2'], 42.42), ({}, [], -42), ({'key2' : NULL}, [NULL], NULL)]\" +'[]' +'[({}, [], 0)]' +'[({}, [NULL], NULL)]' +\"[({}, ['String3'], NULL)]\" +\"[({'key3': NULL}, [], NULL)]\""> $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CSV')" + +echo -e "true +false +\N" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CSV')" + +echo -e "'[true, NULL]' +'[]' +'[NULL]' +'[false]'" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CSV')" + + +echo -e "'(1, 2, 3)'"> $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CSV')" + +echo -e "'123.123'"> $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CSV')" + +echo -e "'[(1, 2, 3)]'"> $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CSV')" + +echo -e "\"[(1, 2, 3)]\""> $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CSV')" + + diff --git a/tests/queries/0_stateless/02247_names_order_in_json_and_tskv.reference b/tests/queries/0_stateless/02247_names_order_in_json_and_tskv.reference index 49a285dc11a..300846c17a0 100644 --- a/tests/queries/0_stateless/02247_names_order_in_json_and_tskv.reference +++ b/tests/queries/0_stateless/02247_names_order_in_json_and_tskv.reference @@ -1,15 +1,15 @@ -a Nullable(String) +a Nullable(Float64) b Nullable(String) -c Nullable(String) -1 s1 \N +c Array(Nullable(Float64)) +1 s1 [] 2 } [2] -\N \N \N -\N \N \N +\N \N [] +\N \N [] \N \N [3] -b Nullable(String) -a Nullable(String) -c Nullable(String) -e Nullable(String) +b Nullable(Float64) +a Nullable(Float64) +c Nullable(Float64) +e Nullable(Float64) 1 \N \N \N \N 2 3 \N \N \N \N \N diff --git a/utils/check-style/codespell-ignore-words.list b/utils/check-style/codespell-ignore-words.list index d3a7586647c..7aabaff17c5 100644 --- a/utils/check-style/codespell-ignore-words.list +++ b/utils/check-style/codespell-ignore-words.list @@ -10,3 +10,4 @@ ths offsett numer ue +alse