Allow to use String type instead of Binary in Arrow/Parquet/ORC formats

This commit is contained in:
avogar 2022-05-18 14:51:21 +00:00
parent a19d4c6f1f
commit a0369fb9a6
11 changed files with 77 additions and 35 deletions

View File

@ -698,6 +698,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
M(Bool, output_format_pretty_color, true, "Use ANSI escape sequences to paint colors in Pretty formats", 0) \ M(Bool, output_format_pretty_color, true, "Use ANSI escape sequences to paint colors in Pretty formats", 0) \
M(String, output_format_pretty_grid_charset, "UTF-8", "Charset for printing grid borders. Available charsets: ASCII, UTF-8 (default one).", 0) \ M(String, output_format_pretty_grid_charset, "UTF-8", "Charset for printing grid borders. Available charsets: ASCII, UTF-8 (default one).", 0) \
M(UInt64, output_format_parquet_row_group_size, 1000000, "Row group size in rows.", 0) \ M(UInt64, output_format_parquet_row_group_size, 1000000, "Row group size in rows.", 0) \
M(Bool, output_format_parquet_string_as_string, false, "Use Parquet String type instead of Binary for String columns.", 0) \
M(String, output_format_avro_codec, "", "Compression codec used for output. Possible values: 'null', 'deflate', 'snappy'.", 0) \ M(String, output_format_avro_codec, "", "Compression codec used for output. Possible values: 'null', 'deflate', 'snappy'.", 0) \
M(UInt64, output_format_avro_sync_interval, 16 * 1024, "Sync interval in bytes.", 0) \ M(UInt64, output_format_avro_sync_interval, 16 * 1024, "Sync interval in bytes.", 0) \
M(String, output_format_avro_string_column_pattern, "", "For Avro format: regexp of String columns to select as AVRO string.", 0) \ M(String, output_format_avro_string_column_pattern, "", "For Avro format: regexp of String columns to select as AVRO string.", 0) \
@ -735,6 +736,9 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
M(UInt64, cross_to_inner_join_rewrite, 1, "Use inner join instead of comma/cross join if possible. Possible values: 0 - no rewrite, 1 - apply if possible, 2 - force rewrite all cross joins", 0) \ M(UInt64, cross_to_inner_join_rewrite, 1, "Use inner join instead of comma/cross join if possible. Possible values: 0 - no rewrite, 1 - apply if possible, 2 - force rewrite all cross joins", 0) \
\ \
M(Bool, output_format_arrow_low_cardinality_as_dictionary, false, "Enable output LowCardinality type as Dictionary Arrow type", 0) \ M(Bool, output_format_arrow_low_cardinality_as_dictionary, false, "Enable output LowCardinality type as Dictionary Arrow type", 0) \
M(Bool, output_format_arrow_string_as_string, false, "Use Arrow String type instead of Binary for String columns", 0) \
\
M(Bool, output_format_orc_string_as_string, false, "Use ORC String type instead of Binary for String columns", 0) \
\ \
M(EnumComparingMode, format_capn_proto_enum_comparising_mode, FormatSettings::EnumComparingMode::BY_VALUES, "How to map ClickHouse Enum and CapnProto Enum", 0) \ M(EnumComparingMode, format_capn_proto_enum_comparising_mode, FormatSettings::EnumComparingMode::BY_VALUES, "How to map ClickHouse Enum and CapnProto Enum", 0) \
\ \

View File

@ -99,6 +99,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
format_settings.parquet.case_insensitive_column_matching = settings.input_format_parquet_case_insensitive_column_matching; format_settings.parquet.case_insensitive_column_matching = settings.input_format_parquet_case_insensitive_column_matching;
format_settings.parquet.allow_missing_columns = settings.input_format_parquet_allow_missing_columns; format_settings.parquet.allow_missing_columns = settings.input_format_parquet_allow_missing_columns;
format_settings.parquet.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference; format_settings.parquet.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference;
format_settings.parquet.output_string_as_string = settings.output_format_parquet_string_as_string;
format_settings.pretty.charset = settings.output_format_pretty_grid_charset.toString() == "ASCII" ? FormatSettings::Pretty::Charset::ASCII : FormatSettings::Pretty::Charset::UTF8; format_settings.pretty.charset = settings.output_format_pretty_grid_charset.toString() == "ASCII" ? FormatSettings::Pretty::Charset::ASCII : FormatSettings::Pretty::Charset::UTF8;
format_settings.pretty.color = settings.output_format_pretty_color; format_settings.pretty.color = settings.output_format_pretty_color;
format_settings.pretty.max_column_pad_width = settings.output_format_pretty_max_column_pad_width; format_settings.pretty.max_column_pad_width = settings.output_format_pretty_max_column_pad_width;
@ -132,17 +133,19 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
format_settings.arrow.import_nested = settings.input_format_arrow_import_nested; format_settings.arrow.import_nested = settings.input_format_arrow_import_nested;
format_settings.arrow.allow_missing_columns = settings.input_format_arrow_allow_missing_columns; format_settings.arrow.allow_missing_columns = settings.input_format_arrow_allow_missing_columns;
format_settings.arrow.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference; format_settings.arrow.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference;
format_settings.arrow.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference;
format_settings.arrow.case_insensitive_column_matching = settings.input_format_arrow_case_insensitive_column_matching;
format_settings.arrow.output_string_as_string = settings.output_format_arrow_string_as_string;
format_settings.orc.import_nested = settings.input_format_orc_import_nested; format_settings.orc.import_nested = settings.input_format_orc_import_nested;
format_settings.orc.allow_missing_columns = settings.input_format_orc_allow_missing_columns; format_settings.orc.allow_missing_columns = settings.input_format_orc_allow_missing_columns;
format_settings.orc.row_batch_size = settings.input_format_orc_row_batch_size; format_settings.orc.row_batch_size = settings.input_format_orc_row_batch_size;
format_settings.orc.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_orc_skip_columns_with_unsupported_types_in_schema_inference; format_settings.orc.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_orc_skip_columns_with_unsupported_types_in_schema_inference;
format_settings.arrow.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference;
format_settings.arrow.case_insensitive_column_matching = settings.input_format_arrow_case_insensitive_column_matching;
format_settings.orc.import_nested = settings.input_format_orc_import_nested; format_settings.orc.import_nested = settings.input_format_orc_import_nested;
format_settings.orc.allow_missing_columns = settings.input_format_orc_allow_missing_columns; format_settings.orc.allow_missing_columns = settings.input_format_orc_allow_missing_columns;
format_settings.orc.row_batch_size = settings.input_format_orc_row_batch_size; format_settings.orc.row_batch_size = settings.input_format_orc_row_batch_size;
format_settings.orc.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_orc_skip_columns_with_unsupported_types_in_schema_inference; format_settings.orc.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_orc_skip_columns_with_unsupported_types_in_schema_inference;
format_settings.orc.case_insensitive_column_matching = settings.input_format_orc_case_insensitive_column_matching; format_settings.orc.case_insensitive_column_matching = settings.input_format_orc_case_insensitive_column_matching;
format_settings.orc.output_string_as_string = settings.output_format_orc_string_as_string;
format_settings.defaults_for_omitted_fields = settings.input_format_defaults_for_omitted_fields; format_settings.defaults_for_omitted_fields = settings.input_format_defaults_for_omitted_fields;
format_settings.capn_proto.enum_comparing_mode = settings.format_capn_proto_enum_comparising_mode; format_settings.capn_proto.enum_comparing_mode = settings.format_capn_proto_enum_comparising_mode;
format_settings.seekable_read = settings.input_format_allow_seeks; format_settings.seekable_read = settings.input_format_allow_seeks;

View File

@ -81,6 +81,7 @@ struct FormatSettings
bool allow_missing_columns = false; bool allow_missing_columns = false;
bool skip_columns_with_unsupported_types_in_schema_inference = false; bool skip_columns_with_unsupported_types_in_schema_inference = false;
bool case_insensitive_column_matching = false; bool case_insensitive_column_matching = false;
bool output_string_as_string = false;
} arrow; } arrow;
struct struct
@ -148,6 +149,7 @@ struct FormatSettings
bool skip_columns_with_unsupported_types_in_schema_inference = false; bool skip_columns_with_unsupported_types_in_schema_inference = false;
bool case_insensitive_column_matching = false; bool case_insensitive_column_matching = false;
std::unordered_set<int> skip_row_groups = {}; std::unordered_set<int> skip_row_groups = {};
bool output_string_as_string = false;
} parquet; } parquet;
struct Pretty struct Pretty
@ -234,6 +236,7 @@ struct FormatSettings
bool skip_columns_with_unsupported_types_in_schema_inference = false; bool skip_columns_with_unsupported_types_in_schema_inference = false;
bool case_insensitive_column_matching = false; bool case_insensitive_column_matching = false;
std::unordered_set<int> skip_stripes = {}; std::unordered_set<int> skip_stripes = {};
bool output_string_as_string = false;
} orc; } orc;
/// For capnProto format we should determine how to /// For capnProto format we should determine how to

View File

@ -34,7 +34,7 @@ void ArrowBlockOutputFormat::consume(Chunk chunk)
{ {
const Block & header = getPort(PortKind::Main).getHeader(); const Block & header = getPort(PortKind::Main).getHeader();
ch_column_to_arrow_column ch_column_to_arrow_column
= std::make_unique<CHColumnToArrowColumn>(header, "Arrow", format_settings.arrow.low_cardinality_as_dictionary); = std::make_unique<CHColumnToArrowColumn>(header, "Arrow", format_settings.arrow.low_cardinality_as_dictionary, format_settings.arrow.output_string_as_string);
} }
ch_column_to_arrow_column->chChunkToArrowTable(arrow_table, chunk, columns_num); ch_column_to_arrow_column->chChunkToArrowTable(arrow_table, chunk, columns_num);

View File

@ -36,6 +36,8 @@
#include <boost/algorithm/string.hpp> #include <boost/algorithm/string.hpp>
#include <boost/algorithm/string/case_conv.hpp> #include <boost/algorithm/string/case_conv.hpp>
#include <Common/logger_useful.h>
/// UINT16 and UINT32 are processed separately, see comments in readColumnFromArrowColumn. /// UINT16 and UINT32 are processed separately, see comments in readColumnFromArrowColumn.
#define FOR_ARROW_NUMERIC_TYPES(M) \ #define FOR_ARROW_NUMERIC_TYPES(M) \
M(arrow::Type::UINT8, DB::UInt8) \ M(arrow::Type::UINT8, DB::UInt8) \

View File

@ -168,6 +168,7 @@ namespace DB
String format_name, String format_name,
size_t start, size_t start,
size_t end, size_t end,
bool output_string_as_string,
std::unordered_map<String, std::shared_ptr<arrow::Array>> & dictionary_values); std::unordered_map<String, std::shared_ptr<arrow::Array>> & dictionary_values);
template <typename Builder> template <typename Builder>
@ -180,6 +181,7 @@ namespace DB
String format_name, String format_name,
size_t start, size_t start,
size_t end, size_t end,
bool output_string_as_string,
std::unordered_map<String, std::shared_ptr<arrow::Array>> & dictionary_values) std::unordered_map<String, std::shared_ptr<arrow::Array>> & dictionary_values)
{ {
const auto * column_array = assert_cast<const ColumnArray *>(column.get()); const auto * column_array = assert_cast<const ColumnArray *>(column.get());
@ -196,7 +198,7 @@ namespace DB
/// Start new array. /// Start new array.
components_status = builder.Append(); components_status = builder.Append();
checkStatus(components_status, nested_column->getName(), format_name); checkStatus(components_status, nested_column->getName(), format_name);
fillArrowArray(column_name, nested_column, nested_type, null_bytemap, value_builder, format_name, offsets[array_idx - 1], offsets[array_idx], dictionary_values); fillArrowArray(column_name, nested_column, nested_type, null_bytemap, value_builder, format_name, offsets[array_idx - 1], offsets[array_idx], output_string_as_string, dictionary_values);
} }
} }
@ -209,6 +211,7 @@ namespace DB
String format_name, String format_name,
size_t start, size_t start,
size_t end, size_t end,
bool output_string_as_string,
std::unordered_map<String, std::shared_ptr<arrow::Array>> & dictionary_values) std::unordered_map<String, std::shared_ptr<arrow::Array>> & dictionary_values)
{ {
const auto * column_tuple = assert_cast<const ColumnTuple *>(column.get()); const auto * column_tuple = assert_cast<const ColumnTuple *>(column.get());
@ -219,7 +222,7 @@ namespace DB
for (size_t i = 0; i != column_tuple->tupleSize(); ++i) for (size_t i = 0; i != column_tuple->tupleSize(); ++i)
{ {
ColumnPtr nested_column = column_tuple->getColumnPtr(i); ColumnPtr nested_column = column_tuple->getColumnPtr(i);
fillArrowArray(column_name + "." + std::to_string(i), nested_column, nested_types[i], null_bytemap, builder.field_builder(i), format_name, start, end, dictionary_values); fillArrowArray(column_name + "." + std::to_string(i), nested_column, nested_types[i], null_bytemap, builder.field_builder(i), format_name, start, end, output_string_as_string, dictionary_values);
} }
for (size_t i = start; i != end; ++i) for (size_t i = start; i != end; ++i)
@ -267,6 +270,7 @@ namespace DB
String format_name, String format_name,
size_t start, size_t start,
size_t end, size_t end,
bool output_string_as_string,
std::unordered_map<String, std::shared_ptr<arrow::Array>> & dictionary_values) std::unordered_map<String, std::shared_ptr<arrow::Array>> & dictionary_values)
{ {
const auto * column_lc = assert_cast<const ColumnLowCardinality *>(column.get()); const auto * column_lc = assert_cast<const ColumnLowCardinality *>(column.get());
@ -284,7 +288,7 @@ namespace DB
auto dict_column = column_lc->getDictionary().getNestedColumn(); auto dict_column = column_lc->getDictionary().getNestedColumn();
const auto & dict_type = assert_cast<const DataTypeLowCardinality *>(column_type.get())->getDictionaryType(); const auto & dict_type = assert_cast<const DataTypeLowCardinality *>(column_type.get())->getDictionaryType();
fillArrowArray(column_name, dict_column, dict_type, nullptr, values_builder.get(), format_name, 0, dict_column->size(), dictionary_values); fillArrowArray(column_name, dict_column, dict_type, nullptr, values_builder.get(), format_name, 0, dict_column->size(), output_string_as_string, dictionary_values);
status = values_builder->Finish(&dict_values); status = values_builder->Finish(&dict_values);
checkStatus(status, column->getName(), format_name); checkStatus(status, column->getName(), format_name);
} }
@ -321,6 +325,7 @@ namespace DB
String format_name, String format_name,
size_t start, size_t start,
size_t end, size_t end,
bool output_string_as_string,
std::unordered_map<String, std::shared_ptr<arrow::Array>> & dictionary_values) std::unordered_map<String, std::shared_ptr<arrow::Array>> & dictionary_values)
{ {
auto value_type = assert_cast<arrow::DictionaryType *>(array_builder->type().get())->value_type(); auto value_type = assert_cast<arrow::DictionaryType *>(array_builder->type().get())->value_type();
@ -328,7 +333,7 @@ namespace DB
#define DISPATCH(ARROW_TYPE_ID, ARROW_TYPE) \ #define DISPATCH(ARROW_TYPE_ID, ARROW_TYPE) \
if (arrow::Type::ARROW_TYPE_ID == value_type->id()) \ if (arrow::Type::ARROW_TYPE_ID == value_type->id()) \
{ \ { \
fillArrowArrayWithLowCardinalityColumnDataImpl<ARROW_TYPE>(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, dictionary_values); \ fillArrowArrayWithLowCardinalityColumnDataImpl<ARROW_TYPE>(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, output_string_as_string, dictionary_values); \
return; \ return; \
} }
@ -338,7 +343,7 @@ namespace DB
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot fill arrow array with {} data.", column_type->getName()); throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot fill arrow array with {} data.", column_type->getName());
} }
template <typename ColumnType> template <typename ColumnType, typename ArrowBuilder>
static void fillArrowArrayWithStringColumnData( static void fillArrowArrayWithStringColumnData(
ColumnPtr write_column, ColumnPtr write_column,
const PaddedPODArray<UInt8> * null_bytemap, const PaddedPODArray<UInt8> * null_bytemap,
@ -348,7 +353,7 @@ namespace DB
size_t end) size_t end)
{ {
const auto & internal_column = assert_cast<const ColumnType &>(*write_column); const auto & internal_column = assert_cast<const ColumnType &>(*write_column);
arrow::BinaryBuilder & builder = assert_cast<arrow::BinaryBuilder &>(*array_builder); ArrowBuilder & builder = assert_cast<ArrowBuilder &>(*array_builder);
arrow::Status status; arrow::Status status;
for (size_t string_i = start; string_i < end; ++string_i) for (size_t string_i = start; string_i < end; ++string_i)
@ -442,6 +447,7 @@ namespace DB
String format_name, String format_name,
size_t start, size_t start,
size_t end, size_t end,
bool output_string_as_string,
std::unordered_map<String, std::shared_ptr<arrow::Array>> & dictionary_values) std::unordered_map<String, std::shared_ptr<arrow::Array>> & dictionary_values)
{ {
const String column_type_name = column_type->getFamilyName(); const String column_type_name = column_type->getFamilyName();
@ -453,15 +459,21 @@ namespace DB
DataTypePtr nested_type = assert_cast<const DataTypeNullable *>(column_type.get())->getNestedType(); DataTypePtr nested_type = assert_cast<const DataTypeNullable *>(column_type.get())->getNestedType();
const ColumnPtr & null_column = column_nullable->getNullMapColumnPtr(); const ColumnPtr & null_column = column_nullable->getNullMapColumnPtr();
const PaddedPODArray<UInt8> & bytemap = assert_cast<const ColumnVector<UInt8> &>(*null_column).getData(); const PaddedPODArray<UInt8> & bytemap = assert_cast<const ColumnVector<UInt8> &>(*null_column).getData();
fillArrowArray(column_name, nested_column, nested_type, &bytemap, array_builder, format_name, start, end, dictionary_values); fillArrowArray(column_name, nested_column, nested_type, &bytemap, array_builder, format_name, start, end, output_string_as_string, dictionary_values);
} }
else if (isString(column_type)) else if (isString(column_type))
{ {
fillArrowArrayWithStringColumnData<ColumnString>(column, null_bytemap, format_name, array_builder, start, end); if (output_string_as_string)
fillArrowArrayWithStringColumnData<ColumnString, arrow::StringBuilder>(column, null_bytemap, format_name, array_builder, start, end);
else
fillArrowArrayWithStringColumnData<ColumnString, arrow::BinaryBuilder>(column, null_bytemap, format_name, array_builder, start, end);
} }
else if (isFixedString(column_type)) else if (isFixedString(column_type))
{ {
fillArrowArrayWithStringColumnData<ColumnFixedString>(column, null_bytemap, format_name, array_builder, start, end); if (output_string_as_string)
fillArrowArrayWithStringColumnData<ColumnFixedString, arrow::StringBuilder>(column, null_bytemap, format_name, array_builder, start, end);
else
fillArrowArrayWithStringColumnData<ColumnFixedString, arrow::BinaryBuilder>(column, null_bytemap, format_name, array_builder, start, end);
} }
else if (isDate(column_type)) else if (isDate(column_type))
{ {
@ -477,21 +489,21 @@ namespace DB
} }
else if (isArray(column_type)) else if (isArray(column_type))
{ {
fillArrowArrayWithArrayColumnData<arrow::ListBuilder>(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, dictionary_values); fillArrowArrayWithArrayColumnData<arrow::ListBuilder>(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, output_string_as_string, dictionary_values);
} }
else if (isTuple(column_type)) else if (isTuple(column_type))
{ {
fillArrowArrayWithTupleColumnData(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, dictionary_values); fillArrowArrayWithTupleColumnData(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, output_string_as_string, dictionary_values);
} }
else if (column_type->getTypeId() == TypeIndex::LowCardinality) else if (column_type->getTypeId() == TypeIndex::LowCardinality)
{ {
fillArrowArrayWithLowCardinalityColumnData(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, dictionary_values); fillArrowArrayWithLowCardinalityColumnData(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, output_string_as_string, dictionary_values);
} }
else if (isMap(column_type)) else if (isMap(column_type))
{ {
ColumnPtr column_array = assert_cast<const ColumnMap *>(column.get())->getNestedColumnPtr(); ColumnPtr column_array = assert_cast<const ColumnMap *>(column.get())->getNestedColumnPtr();
DataTypePtr array_type = assert_cast<const DataTypeMap *>(column_type.get())->getNestedType(); DataTypePtr array_type = assert_cast<const DataTypeMap *>(column_type.get())->getNestedType();
fillArrowArrayWithArrayColumnData<arrow::MapBuilder>(column_name, column_array, array_type, null_bytemap, array_builder, format_name, start, end, dictionary_values); fillArrowArrayWithArrayColumnData<arrow::MapBuilder>(column_name, column_array, array_type, null_bytemap, array_builder, format_name, start, end, output_string_as_string, dictionary_values);
} }
else if (isDecimal(column_type)) else if (isDecimal(column_type))
{ {
@ -603,13 +615,13 @@ namespace DB
} }
static std::shared_ptr<arrow::DataType> getArrowType( static std::shared_ptr<arrow::DataType> getArrowType(
DataTypePtr column_type, ColumnPtr column, const std::string & column_name, const std::string & format_name, bool * out_is_column_nullable) DataTypePtr column_type, ColumnPtr column, const std::string & column_name, const std::string & format_name, bool output_string_as_string, bool * out_is_column_nullable)
{ {
if (column_type->isNullable()) if (column_type->isNullable())
{ {
DataTypePtr nested_type = assert_cast<const DataTypeNullable *>(column_type.get())->getNestedType(); DataTypePtr nested_type = assert_cast<const DataTypeNullable *>(column_type.get())->getNestedType();
ColumnPtr nested_column = assert_cast<const ColumnNullable *>(column.get())->getNestedColumnPtr(); ColumnPtr nested_column = assert_cast<const ColumnNullable *>(column.get())->getNestedColumnPtr();
auto arrow_type = getArrowType(nested_type, nested_column, column_name, format_name, out_is_column_nullable); auto arrow_type = getArrowType(nested_type, nested_column, column_name, format_name, output_string_as_string, out_is_column_nullable);
*out_is_column_nullable = true; *out_is_column_nullable = true;
return arrow_type; return arrow_type;
} }
@ -643,7 +655,7 @@ namespace DB
{ {
auto nested_type = assert_cast<const DataTypeArray *>(column_type.get())->getNestedType(); auto nested_type = assert_cast<const DataTypeArray *>(column_type.get())->getNestedType();
auto nested_column = assert_cast<const ColumnArray *>(column.get())->getDataPtr(); auto nested_column = assert_cast<const ColumnArray *>(column.get())->getDataPtr();
auto nested_arrow_type = getArrowType(nested_type, nested_column, column_name, format_name, out_is_column_nullable); auto nested_arrow_type = getArrowType(nested_type, nested_column, column_name, format_name, output_string_as_string, out_is_column_nullable);
return arrow::list(nested_arrow_type); return arrow::list(nested_arrow_type);
} }
@ -655,7 +667,7 @@ namespace DB
for (size_t i = 0; i != nested_types.size(); ++i) for (size_t i = 0; i != nested_types.size(); ++i)
{ {
String name = column_name + "." + std::to_string(i); String name = column_name + "." + std::to_string(i);
auto nested_arrow_type = getArrowType(nested_types[i], tuple_column->getColumnPtr(i), name, format_name, out_is_column_nullable); auto nested_arrow_type = getArrowType(nested_types[i], tuple_column->getColumnPtr(i), name, format_name, output_string_as_string, out_is_column_nullable);
nested_fields.push_back(std::make_shared<arrow::Field>(name, nested_arrow_type, *out_is_column_nullable)); nested_fields.push_back(std::make_shared<arrow::Field>(name, nested_arrow_type, *out_is_column_nullable));
} }
return arrow::struct_(nested_fields); return arrow::struct_(nested_fields);
@ -669,7 +681,7 @@ namespace DB
const auto & indexes_column = lc_column->getIndexesPtr(); const auto & indexes_column = lc_column->getIndexesPtr();
return arrow::dictionary( return arrow::dictionary(
getArrowTypeForLowCardinalityIndexes(indexes_column), getArrowTypeForLowCardinalityIndexes(indexes_column),
getArrowType(nested_type, nested_column, column_name, format_name, out_is_column_nullable)); getArrowType(nested_type, nested_column, column_name, format_name, output_string_as_string, out_is_column_nullable));
} }
if (isMap(column_type)) if (isMap(column_type))
@ -680,10 +692,19 @@ namespace DB
const auto & columns = assert_cast<const ColumnMap *>(column.get())->getNestedData().getColumns(); const auto & columns = assert_cast<const ColumnMap *>(column.get())->getNestedData().getColumns();
return arrow::map( return arrow::map(
getArrowType(key_type, columns[0], column_name, format_name, out_is_column_nullable), getArrowType(key_type, columns[0], column_name, format_name, output_string_as_string, out_is_column_nullable),
getArrowType(val_type, columns[1], column_name, format_name, out_is_column_nullable)); getArrowType(val_type, columns[1], column_name, format_name, output_string_as_string, out_is_column_nullable));
} }
if (isDateTime64(column_type))
{
const auto * datetime64_type = assert_cast<const DataTypeDateTime64 *>(column_type.get());
return arrow::timestamp(getArrowTimeUnit(datetime64_type), datetime64_type->getTimeZone().getTimeZone());
}
if (isStringOrFixedString(column_type) && output_string_as_string)
return arrow::utf8();
const std::string type_name = column_type->getFamilyName(); const std::string type_name = column_type->getFamilyName();
if (const auto * arrow_type_it = std::find_if( if (const auto * arrow_type_it = std::find_if(
internal_type_to_arrow_type.begin(), internal_type_to_arrow_type.begin(),
@ -694,19 +715,13 @@ namespace DB
return arrow_type_it->second; return arrow_type_it->second;
} }
if (isDateTime64(column_type))
{
const auto * datetime64_type = assert_cast<const DataTypeDateTime64 *>(column_type.get());
return arrow::timestamp(getArrowTimeUnit(datetime64_type), datetime64_type->getTimeZone().getTimeZone());
}
throw Exception(ErrorCodes::UNKNOWN_TYPE, throw Exception(ErrorCodes::UNKNOWN_TYPE,
"The type '{}' of a column '{}' is not supported for conversion into {} data format.", "The type '{}' of a column '{}' is not supported for conversion into {} data format.",
column_type->getName(), column_name, format_name); column_type->getName(), column_name, format_name);
} }
CHColumnToArrowColumn::CHColumnToArrowColumn(const Block & header, const std::string & format_name_, bool low_cardinality_as_dictionary_) CHColumnToArrowColumn::CHColumnToArrowColumn(const Block & header, const std::string & format_name_, bool low_cardinality_as_dictionary_, bool output_string_as_string_)
: format_name(format_name_), low_cardinality_as_dictionary(low_cardinality_as_dictionary_) : format_name(format_name_), low_cardinality_as_dictionary(low_cardinality_as_dictionary_), output_string_as_string(output_string_as_string_)
{ {
arrow_fields.reserve(header.columns()); arrow_fields.reserve(header.columns());
header_columns.reserve(header.columns()); header_columns.reserve(header.columns());
@ -741,7 +756,7 @@ namespace DB
if (!is_arrow_fields_initialized) if (!is_arrow_fields_initialized)
{ {
bool is_column_nullable = false; bool is_column_nullable = false;
auto arrow_type = getArrowType(header_column.type, column, header_column.name, format_name, &is_column_nullable); auto arrow_type = getArrowType(header_column.type, column, header_column.name, format_name, output_string_as_string, &is_column_nullable);
arrow_fields.emplace_back(std::make_shared<arrow::Field>(header_column.name, arrow_type, is_column_nullable)); arrow_fields.emplace_back(std::make_shared<arrow::Field>(header_column.name, arrow_type, is_column_nullable));
} }
@ -751,7 +766,7 @@ namespace DB
checkStatus(status, column->getName(), format_name); checkStatus(status, column->getName(), format_name);
fillArrowArray( fillArrowArray(
header_column.name, column, header_column.type, nullptr, array_builder.get(), format_name, 0, column->size(), dictionary_values); header_column.name, column, header_column.type, nullptr, array_builder.get(), format_name, 0, column->size(), output_string_as_string, dictionary_values);
std::shared_ptr<arrow::Array> arrow_array; std::shared_ptr<arrow::Array> arrow_array;
status = array_builder->Finish(&arrow_array); status = array_builder->Finish(&arrow_array);

View File

@ -14,7 +14,7 @@ namespace DB
class CHColumnToArrowColumn class CHColumnToArrowColumn
{ {
public: public:
CHColumnToArrowColumn(const Block & header, const std::string & format_name_, bool low_cardinality_as_dictionary_); CHColumnToArrowColumn(const Block & header, const std::string & format_name_, bool low_cardinality_as_dictionary_, bool output_string_as_string_);
void chChunkToArrowTable(std::shared_ptr<arrow::Table> & res, const Chunk & chunk, size_t columns_num); void chChunkToArrowTable(std::shared_ptr<arrow::Table> & res, const Chunk & chunk, size_t columns_num);
@ -32,6 +32,10 @@ private:
/// because LowCardinality column from header always has indexes type UInt8, so, we should get /// because LowCardinality column from header always has indexes type UInt8, so, we should get
/// proper indexes type from first chunk of data. /// proper indexes type from first chunk of data.
bool is_arrow_fields_initialized = false; bool is_arrow_fields_initialized = false;
/// Output columns with String data type as Arrow::String type.
/// By default Arrow::Binary is used.
bool output_string_as_string = false;
}; };
} }

View File

@ -100,6 +100,8 @@ ORC_UNIQUE_PTR<orc::Type> ORCBlockOutputFormat::getORCType(const DataTypePtr & t
case TypeIndex::FixedString: [[fallthrough]]; case TypeIndex::FixedString: [[fallthrough]];
case TypeIndex::String: case TypeIndex::String:
{ {
if (format_settings.orc.output_string_as_string)
return orc::createPrimitiveType(orc::TypeKind::STRING);
return orc::createPrimitiveType(orc::TypeKind::BINARY); return orc::createPrimitiveType(orc::TypeKind::BINARY);
} }
case TypeIndex::Nullable: case TypeIndex::Nullable:

View File

@ -29,7 +29,7 @@ void ParquetBlockOutputFormat::consume(Chunk chunk)
if (!ch_column_to_arrow_column) if (!ch_column_to_arrow_column)
{ {
const Block & header = getPort(PortKind::Main).getHeader(); const Block & header = getPort(PortKind::Main).getHeader();
ch_column_to_arrow_column = std::make_unique<CHColumnToArrowColumn>(header, "Parquet", false); ch_column_to_arrow_column = std::make_unique<CHColumnToArrowColumn>(header, "Parquet", false, format_settings.parquet.output_string_as_string);
} }
ch_column_to_arrow_column->chChunkToArrowTable(arrow_table, chunk, columns_num); ch_column_to_arrow_column->chChunkToArrowTable(arrow_table, chunk, columns_num);

View File

@ -0,0 +1,3 @@
s Nullable(String)
s Nullable(String)
s Nullable(String)

View File

@ -0,0 +1,6 @@
insert into function file(data_02304.parquet) select 'hello' as s from numbers(3) settings engine_file_truncate_on_insert=1, output_format_parquet_string_as_string=1;
desc file(data_02304.parquet);
insert into function file(data_02304.orc) select 'hello' as s from numbers(3) settings engine_file_truncate_on_insert=1, output_format_orc_string_as_string=1;
desc file(data_02304.orc);
insert into function file(data_02304.arrow) select 'hello' as s from numbers(3) settings engine_file_truncate_on_insert=1, output_format_arrow_string_as_string=1;
desc file(data_02304.arrow);