From a0369fb9a6be015e0ce4fee405419d8531cdd54d Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 18 May 2022 14:51:21 +0000 Subject: [PATCH] Allow to use String type instead of Binary in Arrow/Parquet/ORC formats --- src/Core/Settings.h | 4 + src/Formats/FormatFactory.cpp | 7 +- src/Formats/FormatSettings.h | 3 + .../Formats/Impl/ArrowBlockOutputFormat.cpp | 2 +- .../Formats/Impl/ArrowColumnToCHColumn.cpp | 2 + .../Formats/Impl/CHColumnToArrowColumn.cpp | 75 +++++++++++-------- .../Formats/Impl/CHColumnToArrowColumn.h | 6 +- .../Formats/Impl/ORCBlockOutputFormat.cpp | 2 + .../Formats/Impl/ParquetBlockOutputFormat.cpp | 2 +- ...c_arrow_parquet_string_as_string.reference | 3 + ...304_orc_arrow_parquet_string_as_string.sql | 6 ++ 11 files changed, 77 insertions(+), 35 deletions(-) create mode 100644 tests/queries/0_stateless/02304_orc_arrow_parquet_string_as_string.reference create mode 100644 tests/queries/0_stateless/02304_orc_arrow_parquet_string_as_string.sql diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 29427c673ac..5f17b088813 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -698,6 +698,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) M(Bool, output_format_pretty_color, true, "Use ANSI escape sequences to paint colors in Pretty formats", 0) \ M(String, output_format_pretty_grid_charset, "UTF-8", "Charset for printing grid borders. Available charsets: ASCII, UTF-8 (default one).", 0) \ M(UInt64, output_format_parquet_row_group_size, 1000000, "Row group size in rows.", 0) \ + M(Bool, output_format_parquet_string_as_string, false, "Use Parquet String type instead of Binary for String columns.", 0) \ M(String, output_format_avro_codec, "", "Compression codec used for output. Possible values: 'null', 'deflate', 'snappy'.", 0) \ M(UInt64, output_format_avro_sync_interval, 16 * 1024, "Sync interval in bytes.", 0) \ M(String, output_format_avro_string_column_pattern, "", "For Avro format: regexp of String columns to select as AVRO string.", 0) \ @@ -735,6 +736,9 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) M(UInt64, cross_to_inner_join_rewrite, 1, "Use inner join instead of comma/cross join if possible. Possible values: 0 - no rewrite, 1 - apply if possible, 2 - force rewrite all cross joins", 0) \ \ M(Bool, output_format_arrow_low_cardinality_as_dictionary, false, "Enable output LowCardinality type as Dictionary Arrow type", 0) \ + M(Bool, output_format_arrow_string_as_string, false, "Use Arrow String type instead of Binary for String columns", 0) \ + \ + M(Bool, output_format_orc_string_as_string, false, "Use ORC String type instead of Binary for String columns", 0) \ \ M(EnumComparingMode, format_capn_proto_enum_comparising_mode, FormatSettings::EnumComparingMode::BY_VALUES, "How to map ClickHouse Enum and CapnProto Enum", 0) \ \ diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 96b52cd2423..4c1b23a75ab 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -99,6 +99,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.parquet.case_insensitive_column_matching = settings.input_format_parquet_case_insensitive_column_matching; format_settings.parquet.allow_missing_columns = settings.input_format_parquet_allow_missing_columns; format_settings.parquet.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference; + format_settings.parquet.output_string_as_string = settings.output_format_parquet_string_as_string; format_settings.pretty.charset = settings.output_format_pretty_grid_charset.toString() == "ASCII" ? FormatSettings::Pretty::Charset::ASCII : FormatSettings::Pretty::Charset::UTF8; format_settings.pretty.color = settings.output_format_pretty_color; format_settings.pretty.max_column_pad_width = settings.output_format_pretty_max_column_pad_width; @@ -132,17 +133,19 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.arrow.import_nested = settings.input_format_arrow_import_nested; format_settings.arrow.allow_missing_columns = settings.input_format_arrow_allow_missing_columns; format_settings.arrow.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference; + format_settings.arrow.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference; + format_settings.arrow.case_insensitive_column_matching = settings.input_format_arrow_case_insensitive_column_matching; + format_settings.arrow.output_string_as_string = settings.output_format_arrow_string_as_string; format_settings.orc.import_nested = settings.input_format_orc_import_nested; format_settings.orc.allow_missing_columns = settings.input_format_orc_allow_missing_columns; format_settings.orc.row_batch_size = settings.input_format_orc_row_batch_size; format_settings.orc.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_orc_skip_columns_with_unsupported_types_in_schema_inference; - format_settings.arrow.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference; - format_settings.arrow.case_insensitive_column_matching = settings.input_format_arrow_case_insensitive_column_matching; format_settings.orc.import_nested = settings.input_format_orc_import_nested; format_settings.orc.allow_missing_columns = settings.input_format_orc_allow_missing_columns; format_settings.orc.row_batch_size = settings.input_format_orc_row_batch_size; format_settings.orc.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_orc_skip_columns_with_unsupported_types_in_schema_inference; format_settings.orc.case_insensitive_column_matching = settings.input_format_orc_case_insensitive_column_matching; + format_settings.orc.output_string_as_string = settings.output_format_orc_string_as_string; format_settings.defaults_for_omitted_fields = settings.input_format_defaults_for_omitted_fields; format_settings.capn_proto.enum_comparing_mode = settings.format_capn_proto_enum_comparising_mode; format_settings.seekable_read = settings.input_format_allow_seeks; diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index 4f77fe099e1..e6f0a7d229e 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -81,6 +81,7 @@ struct FormatSettings bool allow_missing_columns = false; bool skip_columns_with_unsupported_types_in_schema_inference = false; bool case_insensitive_column_matching = false; + bool output_string_as_string = false; } arrow; struct @@ -148,6 +149,7 @@ struct FormatSettings bool skip_columns_with_unsupported_types_in_schema_inference = false; bool case_insensitive_column_matching = false; std::unordered_set skip_row_groups = {}; + bool output_string_as_string = false; } parquet; struct Pretty @@ -234,6 +236,7 @@ struct FormatSettings bool skip_columns_with_unsupported_types_in_schema_inference = false; bool case_insensitive_column_matching = false; std::unordered_set skip_stripes = {}; + bool output_string_as_string = false; } orc; /// For capnProto format we should determine how to diff --git a/src/Processors/Formats/Impl/ArrowBlockOutputFormat.cpp b/src/Processors/Formats/Impl/ArrowBlockOutputFormat.cpp index 60408f13ff0..83eaefa8cf7 100644 --- a/src/Processors/Formats/Impl/ArrowBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/ArrowBlockOutputFormat.cpp @@ -34,7 +34,7 @@ void ArrowBlockOutputFormat::consume(Chunk chunk) { const Block & header = getPort(PortKind::Main).getHeader(); ch_column_to_arrow_column - = std::make_unique(header, "Arrow", format_settings.arrow.low_cardinality_as_dictionary); + = std::make_unique(header, "Arrow", format_settings.arrow.low_cardinality_as_dictionary, format_settings.arrow.output_string_as_string); } ch_column_to_arrow_column->chChunkToArrowTable(arrow_table, chunk, columns_num); diff --git a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp index c792d828e44..543d09a48d3 100644 --- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp +++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp @@ -36,6 +36,8 @@ #include #include +#include + /// UINT16 and UINT32 are processed separately, see comments in readColumnFromArrowColumn. #define FOR_ARROW_NUMERIC_TYPES(M) \ M(arrow::Type::UINT8, DB::UInt8) \ diff --git a/src/Processors/Formats/Impl/CHColumnToArrowColumn.cpp b/src/Processors/Formats/Impl/CHColumnToArrowColumn.cpp index 3f6a36e8e8c..bd5a6368291 100644 --- a/src/Processors/Formats/Impl/CHColumnToArrowColumn.cpp +++ b/src/Processors/Formats/Impl/CHColumnToArrowColumn.cpp @@ -168,6 +168,7 @@ namespace DB String format_name, size_t start, size_t end, + bool output_string_as_string, std::unordered_map> & dictionary_values); template @@ -180,6 +181,7 @@ namespace DB String format_name, size_t start, size_t end, + bool output_string_as_string, std::unordered_map> & dictionary_values) { const auto * column_array = assert_cast(column.get()); @@ -196,7 +198,7 @@ namespace DB /// Start new array. components_status = builder.Append(); checkStatus(components_status, nested_column->getName(), format_name); - fillArrowArray(column_name, nested_column, nested_type, null_bytemap, value_builder, format_name, offsets[array_idx - 1], offsets[array_idx], dictionary_values); + fillArrowArray(column_name, nested_column, nested_type, null_bytemap, value_builder, format_name, offsets[array_idx - 1], offsets[array_idx], output_string_as_string, dictionary_values); } } @@ -209,6 +211,7 @@ namespace DB String format_name, size_t start, size_t end, + bool output_string_as_string, std::unordered_map> & dictionary_values) { const auto * column_tuple = assert_cast(column.get()); @@ -219,7 +222,7 @@ namespace DB for (size_t i = 0; i != column_tuple->tupleSize(); ++i) { ColumnPtr nested_column = column_tuple->getColumnPtr(i); - fillArrowArray(column_name + "." + std::to_string(i), nested_column, nested_types[i], null_bytemap, builder.field_builder(i), format_name, start, end, dictionary_values); + fillArrowArray(column_name + "." + std::to_string(i), nested_column, nested_types[i], null_bytemap, builder.field_builder(i), format_name, start, end, output_string_as_string, dictionary_values); } for (size_t i = start; i != end; ++i) @@ -267,6 +270,7 @@ namespace DB String format_name, size_t start, size_t end, + bool output_string_as_string, std::unordered_map> & dictionary_values) { const auto * column_lc = assert_cast(column.get()); @@ -284,7 +288,7 @@ namespace DB auto dict_column = column_lc->getDictionary().getNestedColumn(); const auto & dict_type = assert_cast(column_type.get())->getDictionaryType(); - fillArrowArray(column_name, dict_column, dict_type, nullptr, values_builder.get(), format_name, 0, dict_column->size(), dictionary_values); + fillArrowArray(column_name, dict_column, dict_type, nullptr, values_builder.get(), format_name, 0, dict_column->size(), output_string_as_string, dictionary_values); status = values_builder->Finish(&dict_values); checkStatus(status, column->getName(), format_name); } @@ -321,6 +325,7 @@ namespace DB String format_name, size_t start, size_t end, + bool output_string_as_string, std::unordered_map> & dictionary_values) { auto value_type = assert_cast(array_builder->type().get())->value_type(); @@ -328,7 +333,7 @@ namespace DB #define DISPATCH(ARROW_TYPE_ID, ARROW_TYPE) \ if (arrow::Type::ARROW_TYPE_ID == value_type->id()) \ { \ - fillArrowArrayWithLowCardinalityColumnDataImpl(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, dictionary_values); \ + fillArrowArrayWithLowCardinalityColumnDataImpl(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, output_string_as_string, dictionary_values); \ return; \ } @@ -338,7 +343,7 @@ namespace DB throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot fill arrow array with {} data.", column_type->getName()); } - template + template static void fillArrowArrayWithStringColumnData( ColumnPtr write_column, const PaddedPODArray * null_bytemap, @@ -348,7 +353,7 @@ namespace DB size_t end) { const auto & internal_column = assert_cast(*write_column); - arrow::BinaryBuilder & builder = assert_cast(*array_builder); + ArrowBuilder & builder = assert_cast(*array_builder); arrow::Status status; for (size_t string_i = start; string_i < end; ++string_i) @@ -442,6 +447,7 @@ namespace DB String format_name, size_t start, size_t end, + bool output_string_as_string, std::unordered_map> & dictionary_values) { const String column_type_name = column_type->getFamilyName(); @@ -453,15 +459,21 @@ namespace DB DataTypePtr nested_type = assert_cast(column_type.get())->getNestedType(); const ColumnPtr & null_column = column_nullable->getNullMapColumnPtr(); const PaddedPODArray & bytemap = assert_cast &>(*null_column).getData(); - fillArrowArray(column_name, nested_column, nested_type, &bytemap, array_builder, format_name, start, end, dictionary_values); + fillArrowArray(column_name, nested_column, nested_type, &bytemap, array_builder, format_name, start, end, output_string_as_string, dictionary_values); } else if (isString(column_type)) { - fillArrowArrayWithStringColumnData(column, null_bytemap, format_name, array_builder, start, end); + if (output_string_as_string) + fillArrowArrayWithStringColumnData(column, null_bytemap, format_name, array_builder, start, end); + else + fillArrowArrayWithStringColumnData(column, null_bytemap, format_name, array_builder, start, end); } else if (isFixedString(column_type)) { - fillArrowArrayWithStringColumnData(column, null_bytemap, format_name, array_builder, start, end); + if (output_string_as_string) + fillArrowArrayWithStringColumnData(column, null_bytemap, format_name, array_builder, start, end); + else + fillArrowArrayWithStringColumnData(column, null_bytemap, format_name, array_builder, start, end); } else if (isDate(column_type)) { @@ -477,21 +489,21 @@ namespace DB } else if (isArray(column_type)) { - fillArrowArrayWithArrayColumnData(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, dictionary_values); + fillArrowArrayWithArrayColumnData(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, output_string_as_string, dictionary_values); } else if (isTuple(column_type)) { - fillArrowArrayWithTupleColumnData(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, dictionary_values); + fillArrowArrayWithTupleColumnData(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, output_string_as_string, dictionary_values); } else if (column_type->getTypeId() == TypeIndex::LowCardinality) { - fillArrowArrayWithLowCardinalityColumnData(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, dictionary_values); + fillArrowArrayWithLowCardinalityColumnData(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, output_string_as_string, dictionary_values); } else if (isMap(column_type)) { ColumnPtr column_array = assert_cast(column.get())->getNestedColumnPtr(); DataTypePtr array_type = assert_cast(column_type.get())->getNestedType(); - fillArrowArrayWithArrayColumnData(column_name, column_array, array_type, null_bytemap, array_builder, format_name, start, end, dictionary_values); + fillArrowArrayWithArrayColumnData(column_name, column_array, array_type, null_bytemap, array_builder, format_name, start, end, output_string_as_string, dictionary_values); } else if (isDecimal(column_type)) { @@ -603,13 +615,13 @@ namespace DB } static std::shared_ptr getArrowType( - DataTypePtr column_type, ColumnPtr column, const std::string & column_name, const std::string & format_name, bool * out_is_column_nullable) + DataTypePtr column_type, ColumnPtr column, const std::string & column_name, const std::string & format_name, bool output_string_as_string, bool * out_is_column_nullable) { if (column_type->isNullable()) { DataTypePtr nested_type = assert_cast(column_type.get())->getNestedType(); ColumnPtr nested_column = assert_cast(column.get())->getNestedColumnPtr(); - auto arrow_type = getArrowType(nested_type, nested_column, column_name, format_name, out_is_column_nullable); + auto arrow_type = getArrowType(nested_type, nested_column, column_name, format_name, output_string_as_string, out_is_column_nullable); *out_is_column_nullable = true; return arrow_type; } @@ -643,7 +655,7 @@ namespace DB { auto nested_type = assert_cast(column_type.get())->getNestedType(); auto nested_column = assert_cast(column.get())->getDataPtr(); - auto nested_arrow_type = getArrowType(nested_type, nested_column, column_name, format_name, out_is_column_nullable); + auto nested_arrow_type = getArrowType(nested_type, nested_column, column_name, format_name, output_string_as_string, out_is_column_nullable); return arrow::list(nested_arrow_type); } @@ -655,7 +667,7 @@ namespace DB for (size_t i = 0; i != nested_types.size(); ++i) { String name = column_name + "." + std::to_string(i); - auto nested_arrow_type = getArrowType(nested_types[i], tuple_column->getColumnPtr(i), name, format_name, out_is_column_nullable); + auto nested_arrow_type = getArrowType(nested_types[i], tuple_column->getColumnPtr(i), name, format_name, output_string_as_string, out_is_column_nullable); nested_fields.push_back(std::make_shared(name, nested_arrow_type, *out_is_column_nullable)); } return arrow::struct_(nested_fields); @@ -669,7 +681,7 @@ namespace DB const auto & indexes_column = lc_column->getIndexesPtr(); return arrow::dictionary( getArrowTypeForLowCardinalityIndexes(indexes_column), - getArrowType(nested_type, nested_column, column_name, format_name, out_is_column_nullable)); + getArrowType(nested_type, nested_column, column_name, format_name, output_string_as_string, out_is_column_nullable)); } if (isMap(column_type)) @@ -680,10 +692,19 @@ namespace DB const auto & columns = assert_cast(column.get())->getNestedData().getColumns(); return arrow::map( - getArrowType(key_type, columns[0], column_name, format_name, out_is_column_nullable), - getArrowType(val_type, columns[1], column_name, format_name, out_is_column_nullable)); + getArrowType(key_type, columns[0], column_name, format_name, output_string_as_string, out_is_column_nullable), + getArrowType(val_type, columns[1], column_name, format_name, output_string_as_string, out_is_column_nullable)); } + if (isDateTime64(column_type)) + { + const auto * datetime64_type = assert_cast(column_type.get()); + return arrow::timestamp(getArrowTimeUnit(datetime64_type), datetime64_type->getTimeZone().getTimeZone()); + } + + if (isStringOrFixedString(column_type) && output_string_as_string) + return arrow::utf8(); + const std::string type_name = column_type->getFamilyName(); if (const auto * arrow_type_it = std::find_if( internal_type_to_arrow_type.begin(), @@ -694,19 +715,13 @@ namespace DB return arrow_type_it->second; } - if (isDateTime64(column_type)) - { - const auto * datetime64_type = assert_cast(column_type.get()); - return arrow::timestamp(getArrowTimeUnit(datetime64_type), datetime64_type->getTimeZone().getTimeZone()); - } - throw Exception(ErrorCodes::UNKNOWN_TYPE, "The type '{}' of a column '{}' is not supported for conversion into {} data format.", column_type->getName(), column_name, format_name); } - CHColumnToArrowColumn::CHColumnToArrowColumn(const Block & header, const std::string & format_name_, bool low_cardinality_as_dictionary_) - : format_name(format_name_), low_cardinality_as_dictionary(low_cardinality_as_dictionary_) + CHColumnToArrowColumn::CHColumnToArrowColumn(const Block & header, const std::string & format_name_, bool low_cardinality_as_dictionary_, bool output_string_as_string_) + : format_name(format_name_), low_cardinality_as_dictionary(low_cardinality_as_dictionary_), output_string_as_string(output_string_as_string_) { arrow_fields.reserve(header.columns()); header_columns.reserve(header.columns()); @@ -741,7 +756,7 @@ namespace DB if (!is_arrow_fields_initialized) { bool is_column_nullable = false; - auto arrow_type = getArrowType(header_column.type, column, header_column.name, format_name, &is_column_nullable); + auto arrow_type = getArrowType(header_column.type, column, header_column.name, format_name, output_string_as_string, &is_column_nullable); arrow_fields.emplace_back(std::make_shared(header_column.name, arrow_type, is_column_nullable)); } @@ -751,7 +766,7 @@ namespace DB checkStatus(status, column->getName(), format_name); fillArrowArray( - header_column.name, column, header_column.type, nullptr, array_builder.get(), format_name, 0, column->size(), dictionary_values); + header_column.name, column, header_column.type, nullptr, array_builder.get(), format_name, 0, column->size(), output_string_as_string, dictionary_values); std::shared_ptr arrow_array; status = array_builder->Finish(&arrow_array); diff --git a/src/Processors/Formats/Impl/CHColumnToArrowColumn.h b/src/Processors/Formats/Impl/CHColumnToArrowColumn.h index 50de8045d5f..2896fb3642f 100644 --- a/src/Processors/Formats/Impl/CHColumnToArrowColumn.h +++ b/src/Processors/Formats/Impl/CHColumnToArrowColumn.h @@ -14,7 +14,7 @@ namespace DB class CHColumnToArrowColumn { public: - CHColumnToArrowColumn(const Block & header, const std::string & format_name_, bool low_cardinality_as_dictionary_); + CHColumnToArrowColumn(const Block & header, const std::string & format_name_, bool low_cardinality_as_dictionary_, bool output_string_as_string_); void chChunkToArrowTable(std::shared_ptr & res, const Chunk & chunk, size_t columns_num); @@ -32,6 +32,10 @@ private: /// because LowCardinality column from header always has indexes type UInt8, so, we should get /// proper indexes type from first chunk of data. bool is_arrow_fields_initialized = false; + + /// Output columns with String data type as Arrow::String type. + /// By default Arrow::Binary is used. + bool output_string_as_string = false; }; } diff --git a/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp b/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp index 106b71a9df5..aaa3e8fe976 100644 --- a/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp @@ -100,6 +100,8 @@ ORC_UNIQUE_PTR ORCBlockOutputFormat::getORCType(const DataTypePtr & t case TypeIndex::FixedString: [[fallthrough]]; case TypeIndex::String: { + if (format_settings.orc.output_string_as_string) + return orc::createPrimitiveType(orc::TypeKind::STRING); return orc::createPrimitiveType(orc::TypeKind::BINARY); } case TypeIndex::Nullable: diff --git a/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp index 68e2ae1c6eb..c8e94311af5 100644 --- a/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp @@ -29,7 +29,7 @@ void ParquetBlockOutputFormat::consume(Chunk chunk) if (!ch_column_to_arrow_column) { const Block & header = getPort(PortKind::Main).getHeader(); - ch_column_to_arrow_column = std::make_unique(header, "Parquet", false); + ch_column_to_arrow_column = std::make_unique(header, "Parquet", false, format_settings.parquet.output_string_as_string); } ch_column_to_arrow_column->chChunkToArrowTable(arrow_table, chunk, columns_num); diff --git a/tests/queries/0_stateless/02304_orc_arrow_parquet_string_as_string.reference b/tests/queries/0_stateless/02304_orc_arrow_parquet_string_as_string.reference new file mode 100644 index 00000000000..f0ab418f0ce --- /dev/null +++ b/tests/queries/0_stateless/02304_orc_arrow_parquet_string_as_string.reference @@ -0,0 +1,3 @@ +s Nullable(String) +s Nullable(String) +s Nullable(String) diff --git a/tests/queries/0_stateless/02304_orc_arrow_parquet_string_as_string.sql b/tests/queries/0_stateless/02304_orc_arrow_parquet_string_as_string.sql new file mode 100644 index 00000000000..37ebc1c748e --- /dev/null +++ b/tests/queries/0_stateless/02304_orc_arrow_parquet_string_as_string.sql @@ -0,0 +1,6 @@ +insert into function file(data_02304.parquet) select 'hello' as s from numbers(3) settings engine_file_truncate_on_insert=1, output_format_parquet_string_as_string=1; +desc file(data_02304.parquet); +insert into function file(data_02304.orc) select 'hello' as s from numbers(3) settings engine_file_truncate_on_insert=1, output_format_orc_string_as_string=1; +desc file(data_02304.orc); +insert into function file(data_02304.arrow) select 'hello' as s from numbers(3) settings engine_file_truncate_on_insert=1, output_format_arrow_string_as_string=1; +desc file(data_02304.arrow);