mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-20 08:40:50 +00:00
Allow to use String type instead of Binary in Arrow/Parquet/ORC formats
This commit is contained in:
parent
a19d4c6f1f
commit
a0369fb9a6
@ -698,6 +698,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
|
|||||||
M(Bool, output_format_pretty_color, true, "Use ANSI escape sequences to paint colors in Pretty formats", 0) \
|
M(Bool, output_format_pretty_color, true, "Use ANSI escape sequences to paint colors in Pretty formats", 0) \
|
||||||
M(String, output_format_pretty_grid_charset, "UTF-8", "Charset for printing grid borders. Available charsets: ASCII, UTF-8 (default one).", 0) \
|
M(String, output_format_pretty_grid_charset, "UTF-8", "Charset for printing grid borders. Available charsets: ASCII, UTF-8 (default one).", 0) \
|
||||||
M(UInt64, output_format_parquet_row_group_size, 1000000, "Row group size in rows.", 0) \
|
M(UInt64, output_format_parquet_row_group_size, 1000000, "Row group size in rows.", 0) \
|
||||||
|
M(Bool, output_format_parquet_string_as_string, false, "Use Parquet String type instead of Binary for String columns.", 0) \
|
||||||
M(String, output_format_avro_codec, "", "Compression codec used for output. Possible values: 'null', 'deflate', 'snappy'.", 0) \
|
M(String, output_format_avro_codec, "", "Compression codec used for output. Possible values: 'null', 'deflate', 'snappy'.", 0) \
|
||||||
M(UInt64, output_format_avro_sync_interval, 16 * 1024, "Sync interval in bytes.", 0) \
|
M(UInt64, output_format_avro_sync_interval, 16 * 1024, "Sync interval in bytes.", 0) \
|
||||||
M(String, output_format_avro_string_column_pattern, "", "For Avro format: regexp of String columns to select as AVRO string.", 0) \
|
M(String, output_format_avro_string_column_pattern, "", "For Avro format: regexp of String columns to select as AVRO string.", 0) \
|
||||||
@ -735,6 +736,9 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
|
|||||||
M(UInt64, cross_to_inner_join_rewrite, 1, "Use inner join instead of comma/cross join if possible. Possible values: 0 - no rewrite, 1 - apply if possible, 2 - force rewrite all cross joins", 0) \
|
M(UInt64, cross_to_inner_join_rewrite, 1, "Use inner join instead of comma/cross join if possible. Possible values: 0 - no rewrite, 1 - apply if possible, 2 - force rewrite all cross joins", 0) \
|
||||||
\
|
\
|
||||||
M(Bool, output_format_arrow_low_cardinality_as_dictionary, false, "Enable output LowCardinality type as Dictionary Arrow type", 0) \
|
M(Bool, output_format_arrow_low_cardinality_as_dictionary, false, "Enable output LowCardinality type as Dictionary Arrow type", 0) \
|
||||||
|
M(Bool, output_format_arrow_string_as_string, false, "Use Arrow String type instead of Binary for String columns", 0) \
|
||||||
|
\
|
||||||
|
M(Bool, output_format_orc_string_as_string, false, "Use ORC String type instead of Binary for String columns", 0) \
|
||||||
\
|
\
|
||||||
M(EnumComparingMode, format_capn_proto_enum_comparising_mode, FormatSettings::EnumComparingMode::BY_VALUES, "How to map ClickHouse Enum and CapnProto Enum", 0) \
|
M(EnumComparingMode, format_capn_proto_enum_comparising_mode, FormatSettings::EnumComparingMode::BY_VALUES, "How to map ClickHouse Enum and CapnProto Enum", 0) \
|
||||||
\
|
\
|
||||||
|
@ -99,6 +99,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
|
|||||||
format_settings.parquet.case_insensitive_column_matching = settings.input_format_parquet_case_insensitive_column_matching;
|
format_settings.parquet.case_insensitive_column_matching = settings.input_format_parquet_case_insensitive_column_matching;
|
||||||
format_settings.parquet.allow_missing_columns = settings.input_format_parquet_allow_missing_columns;
|
format_settings.parquet.allow_missing_columns = settings.input_format_parquet_allow_missing_columns;
|
||||||
format_settings.parquet.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference;
|
format_settings.parquet.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference;
|
||||||
|
format_settings.parquet.output_string_as_string = settings.output_format_parquet_string_as_string;
|
||||||
format_settings.pretty.charset = settings.output_format_pretty_grid_charset.toString() == "ASCII" ? FormatSettings::Pretty::Charset::ASCII : FormatSettings::Pretty::Charset::UTF8;
|
format_settings.pretty.charset = settings.output_format_pretty_grid_charset.toString() == "ASCII" ? FormatSettings::Pretty::Charset::ASCII : FormatSettings::Pretty::Charset::UTF8;
|
||||||
format_settings.pretty.color = settings.output_format_pretty_color;
|
format_settings.pretty.color = settings.output_format_pretty_color;
|
||||||
format_settings.pretty.max_column_pad_width = settings.output_format_pretty_max_column_pad_width;
|
format_settings.pretty.max_column_pad_width = settings.output_format_pretty_max_column_pad_width;
|
||||||
@ -132,17 +133,19 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
|
|||||||
format_settings.arrow.import_nested = settings.input_format_arrow_import_nested;
|
format_settings.arrow.import_nested = settings.input_format_arrow_import_nested;
|
||||||
format_settings.arrow.allow_missing_columns = settings.input_format_arrow_allow_missing_columns;
|
format_settings.arrow.allow_missing_columns = settings.input_format_arrow_allow_missing_columns;
|
||||||
format_settings.arrow.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference;
|
format_settings.arrow.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference;
|
||||||
|
format_settings.arrow.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference;
|
||||||
|
format_settings.arrow.case_insensitive_column_matching = settings.input_format_arrow_case_insensitive_column_matching;
|
||||||
|
format_settings.arrow.output_string_as_string = settings.output_format_arrow_string_as_string;
|
||||||
format_settings.orc.import_nested = settings.input_format_orc_import_nested;
|
format_settings.orc.import_nested = settings.input_format_orc_import_nested;
|
||||||
format_settings.orc.allow_missing_columns = settings.input_format_orc_allow_missing_columns;
|
format_settings.orc.allow_missing_columns = settings.input_format_orc_allow_missing_columns;
|
||||||
format_settings.orc.row_batch_size = settings.input_format_orc_row_batch_size;
|
format_settings.orc.row_batch_size = settings.input_format_orc_row_batch_size;
|
||||||
format_settings.orc.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_orc_skip_columns_with_unsupported_types_in_schema_inference;
|
format_settings.orc.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_orc_skip_columns_with_unsupported_types_in_schema_inference;
|
||||||
format_settings.arrow.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference;
|
|
||||||
format_settings.arrow.case_insensitive_column_matching = settings.input_format_arrow_case_insensitive_column_matching;
|
|
||||||
format_settings.orc.import_nested = settings.input_format_orc_import_nested;
|
format_settings.orc.import_nested = settings.input_format_orc_import_nested;
|
||||||
format_settings.orc.allow_missing_columns = settings.input_format_orc_allow_missing_columns;
|
format_settings.orc.allow_missing_columns = settings.input_format_orc_allow_missing_columns;
|
||||||
format_settings.orc.row_batch_size = settings.input_format_orc_row_batch_size;
|
format_settings.orc.row_batch_size = settings.input_format_orc_row_batch_size;
|
||||||
format_settings.orc.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_orc_skip_columns_with_unsupported_types_in_schema_inference;
|
format_settings.orc.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_orc_skip_columns_with_unsupported_types_in_schema_inference;
|
||||||
format_settings.orc.case_insensitive_column_matching = settings.input_format_orc_case_insensitive_column_matching;
|
format_settings.orc.case_insensitive_column_matching = settings.input_format_orc_case_insensitive_column_matching;
|
||||||
|
format_settings.orc.output_string_as_string = settings.output_format_orc_string_as_string;
|
||||||
format_settings.defaults_for_omitted_fields = settings.input_format_defaults_for_omitted_fields;
|
format_settings.defaults_for_omitted_fields = settings.input_format_defaults_for_omitted_fields;
|
||||||
format_settings.capn_proto.enum_comparing_mode = settings.format_capn_proto_enum_comparising_mode;
|
format_settings.capn_proto.enum_comparing_mode = settings.format_capn_proto_enum_comparising_mode;
|
||||||
format_settings.seekable_read = settings.input_format_allow_seeks;
|
format_settings.seekable_read = settings.input_format_allow_seeks;
|
||||||
|
@ -81,6 +81,7 @@ struct FormatSettings
|
|||||||
bool allow_missing_columns = false;
|
bool allow_missing_columns = false;
|
||||||
bool skip_columns_with_unsupported_types_in_schema_inference = false;
|
bool skip_columns_with_unsupported_types_in_schema_inference = false;
|
||||||
bool case_insensitive_column_matching = false;
|
bool case_insensitive_column_matching = false;
|
||||||
|
bool output_string_as_string = false;
|
||||||
} arrow;
|
} arrow;
|
||||||
|
|
||||||
struct
|
struct
|
||||||
@ -148,6 +149,7 @@ struct FormatSettings
|
|||||||
bool skip_columns_with_unsupported_types_in_schema_inference = false;
|
bool skip_columns_with_unsupported_types_in_schema_inference = false;
|
||||||
bool case_insensitive_column_matching = false;
|
bool case_insensitive_column_matching = false;
|
||||||
std::unordered_set<int> skip_row_groups = {};
|
std::unordered_set<int> skip_row_groups = {};
|
||||||
|
bool output_string_as_string = false;
|
||||||
} parquet;
|
} parquet;
|
||||||
|
|
||||||
struct Pretty
|
struct Pretty
|
||||||
@ -234,6 +236,7 @@ struct FormatSettings
|
|||||||
bool skip_columns_with_unsupported_types_in_schema_inference = false;
|
bool skip_columns_with_unsupported_types_in_schema_inference = false;
|
||||||
bool case_insensitive_column_matching = false;
|
bool case_insensitive_column_matching = false;
|
||||||
std::unordered_set<int> skip_stripes = {};
|
std::unordered_set<int> skip_stripes = {};
|
||||||
|
bool output_string_as_string = false;
|
||||||
} orc;
|
} orc;
|
||||||
|
|
||||||
/// For capnProto format we should determine how to
|
/// For capnProto format we should determine how to
|
||||||
|
@ -34,7 +34,7 @@ void ArrowBlockOutputFormat::consume(Chunk chunk)
|
|||||||
{
|
{
|
||||||
const Block & header = getPort(PortKind::Main).getHeader();
|
const Block & header = getPort(PortKind::Main).getHeader();
|
||||||
ch_column_to_arrow_column
|
ch_column_to_arrow_column
|
||||||
= std::make_unique<CHColumnToArrowColumn>(header, "Arrow", format_settings.arrow.low_cardinality_as_dictionary);
|
= std::make_unique<CHColumnToArrowColumn>(header, "Arrow", format_settings.arrow.low_cardinality_as_dictionary, format_settings.arrow.output_string_as_string);
|
||||||
}
|
}
|
||||||
|
|
||||||
ch_column_to_arrow_column->chChunkToArrowTable(arrow_table, chunk, columns_num);
|
ch_column_to_arrow_column->chChunkToArrowTable(arrow_table, chunk, columns_num);
|
||||||
|
@ -36,6 +36,8 @@
|
|||||||
#include <boost/algorithm/string.hpp>
|
#include <boost/algorithm/string.hpp>
|
||||||
#include <boost/algorithm/string/case_conv.hpp>
|
#include <boost/algorithm/string/case_conv.hpp>
|
||||||
|
|
||||||
|
#include <Common/logger_useful.h>
|
||||||
|
|
||||||
/// UINT16 and UINT32 are processed separately, see comments in readColumnFromArrowColumn.
|
/// UINT16 and UINT32 are processed separately, see comments in readColumnFromArrowColumn.
|
||||||
#define FOR_ARROW_NUMERIC_TYPES(M) \
|
#define FOR_ARROW_NUMERIC_TYPES(M) \
|
||||||
M(arrow::Type::UINT8, DB::UInt8) \
|
M(arrow::Type::UINT8, DB::UInt8) \
|
||||||
|
@ -168,6 +168,7 @@ namespace DB
|
|||||||
String format_name,
|
String format_name,
|
||||||
size_t start,
|
size_t start,
|
||||||
size_t end,
|
size_t end,
|
||||||
|
bool output_string_as_string,
|
||||||
std::unordered_map<String, std::shared_ptr<arrow::Array>> & dictionary_values);
|
std::unordered_map<String, std::shared_ptr<arrow::Array>> & dictionary_values);
|
||||||
|
|
||||||
template <typename Builder>
|
template <typename Builder>
|
||||||
@ -180,6 +181,7 @@ namespace DB
|
|||||||
String format_name,
|
String format_name,
|
||||||
size_t start,
|
size_t start,
|
||||||
size_t end,
|
size_t end,
|
||||||
|
bool output_string_as_string,
|
||||||
std::unordered_map<String, std::shared_ptr<arrow::Array>> & dictionary_values)
|
std::unordered_map<String, std::shared_ptr<arrow::Array>> & dictionary_values)
|
||||||
{
|
{
|
||||||
const auto * column_array = assert_cast<const ColumnArray *>(column.get());
|
const auto * column_array = assert_cast<const ColumnArray *>(column.get());
|
||||||
@ -196,7 +198,7 @@ namespace DB
|
|||||||
/// Start new array.
|
/// Start new array.
|
||||||
components_status = builder.Append();
|
components_status = builder.Append();
|
||||||
checkStatus(components_status, nested_column->getName(), format_name);
|
checkStatus(components_status, nested_column->getName(), format_name);
|
||||||
fillArrowArray(column_name, nested_column, nested_type, null_bytemap, value_builder, format_name, offsets[array_idx - 1], offsets[array_idx], dictionary_values);
|
fillArrowArray(column_name, nested_column, nested_type, null_bytemap, value_builder, format_name, offsets[array_idx - 1], offsets[array_idx], output_string_as_string, dictionary_values);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -209,6 +211,7 @@ namespace DB
|
|||||||
String format_name,
|
String format_name,
|
||||||
size_t start,
|
size_t start,
|
||||||
size_t end,
|
size_t end,
|
||||||
|
bool output_string_as_string,
|
||||||
std::unordered_map<String, std::shared_ptr<arrow::Array>> & dictionary_values)
|
std::unordered_map<String, std::shared_ptr<arrow::Array>> & dictionary_values)
|
||||||
{
|
{
|
||||||
const auto * column_tuple = assert_cast<const ColumnTuple *>(column.get());
|
const auto * column_tuple = assert_cast<const ColumnTuple *>(column.get());
|
||||||
@ -219,7 +222,7 @@ namespace DB
|
|||||||
for (size_t i = 0; i != column_tuple->tupleSize(); ++i)
|
for (size_t i = 0; i != column_tuple->tupleSize(); ++i)
|
||||||
{
|
{
|
||||||
ColumnPtr nested_column = column_tuple->getColumnPtr(i);
|
ColumnPtr nested_column = column_tuple->getColumnPtr(i);
|
||||||
fillArrowArray(column_name + "." + std::to_string(i), nested_column, nested_types[i], null_bytemap, builder.field_builder(i), format_name, start, end, dictionary_values);
|
fillArrowArray(column_name + "." + std::to_string(i), nested_column, nested_types[i], null_bytemap, builder.field_builder(i), format_name, start, end, output_string_as_string, dictionary_values);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (size_t i = start; i != end; ++i)
|
for (size_t i = start; i != end; ++i)
|
||||||
@ -267,6 +270,7 @@ namespace DB
|
|||||||
String format_name,
|
String format_name,
|
||||||
size_t start,
|
size_t start,
|
||||||
size_t end,
|
size_t end,
|
||||||
|
bool output_string_as_string,
|
||||||
std::unordered_map<String, std::shared_ptr<arrow::Array>> & dictionary_values)
|
std::unordered_map<String, std::shared_ptr<arrow::Array>> & dictionary_values)
|
||||||
{
|
{
|
||||||
const auto * column_lc = assert_cast<const ColumnLowCardinality *>(column.get());
|
const auto * column_lc = assert_cast<const ColumnLowCardinality *>(column.get());
|
||||||
@ -284,7 +288,7 @@ namespace DB
|
|||||||
|
|
||||||
auto dict_column = column_lc->getDictionary().getNestedColumn();
|
auto dict_column = column_lc->getDictionary().getNestedColumn();
|
||||||
const auto & dict_type = assert_cast<const DataTypeLowCardinality *>(column_type.get())->getDictionaryType();
|
const auto & dict_type = assert_cast<const DataTypeLowCardinality *>(column_type.get())->getDictionaryType();
|
||||||
fillArrowArray(column_name, dict_column, dict_type, nullptr, values_builder.get(), format_name, 0, dict_column->size(), dictionary_values);
|
fillArrowArray(column_name, dict_column, dict_type, nullptr, values_builder.get(), format_name, 0, dict_column->size(), output_string_as_string, dictionary_values);
|
||||||
status = values_builder->Finish(&dict_values);
|
status = values_builder->Finish(&dict_values);
|
||||||
checkStatus(status, column->getName(), format_name);
|
checkStatus(status, column->getName(), format_name);
|
||||||
}
|
}
|
||||||
@ -321,6 +325,7 @@ namespace DB
|
|||||||
String format_name,
|
String format_name,
|
||||||
size_t start,
|
size_t start,
|
||||||
size_t end,
|
size_t end,
|
||||||
|
bool output_string_as_string,
|
||||||
std::unordered_map<String, std::shared_ptr<arrow::Array>> & dictionary_values)
|
std::unordered_map<String, std::shared_ptr<arrow::Array>> & dictionary_values)
|
||||||
{
|
{
|
||||||
auto value_type = assert_cast<arrow::DictionaryType *>(array_builder->type().get())->value_type();
|
auto value_type = assert_cast<arrow::DictionaryType *>(array_builder->type().get())->value_type();
|
||||||
@ -328,7 +333,7 @@ namespace DB
|
|||||||
#define DISPATCH(ARROW_TYPE_ID, ARROW_TYPE) \
|
#define DISPATCH(ARROW_TYPE_ID, ARROW_TYPE) \
|
||||||
if (arrow::Type::ARROW_TYPE_ID == value_type->id()) \
|
if (arrow::Type::ARROW_TYPE_ID == value_type->id()) \
|
||||||
{ \
|
{ \
|
||||||
fillArrowArrayWithLowCardinalityColumnDataImpl<ARROW_TYPE>(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, dictionary_values); \
|
fillArrowArrayWithLowCardinalityColumnDataImpl<ARROW_TYPE>(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, output_string_as_string, dictionary_values); \
|
||||||
return; \
|
return; \
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -338,7 +343,7 @@ namespace DB
|
|||||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot fill arrow array with {} data.", column_type->getName());
|
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot fill arrow array with {} data.", column_type->getName());
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename ColumnType>
|
template <typename ColumnType, typename ArrowBuilder>
|
||||||
static void fillArrowArrayWithStringColumnData(
|
static void fillArrowArrayWithStringColumnData(
|
||||||
ColumnPtr write_column,
|
ColumnPtr write_column,
|
||||||
const PaddedPODArray<UInt8> * null_bytemap,
|
const PaddedPODArray<UInt8> * null_bytemap,
|
||||||
@ -348,7 +353,7 @@ namespace DB
|
|||||||
size_t end)
|
size_t end)
|
||||||
{
|
{
|
||||||
const auto & internal_column = assert_cast<const ColumnType &>(*write_column);
|
const auto & internal_column = assert_cast<const ColumnType &>(*write_column);
|
||||||
arrow::BinaryBuilder & builder = assert_cast<arrow::BinaryBuilder &>(*array_builder);
|
ArrowBuilder & builder = assert_cast<ArrowBuilder &>(*array_builder);
|
||||||
arrow::Status status;
|
arrow::Status status;
|
||||||
|
|
||||||
for (size_t string_i = start; string_i < end; ++string_i)
|
for (size_t string_i = start; string_i < end; ++string_i)
|
||||||
@ -442,6 +447,7 @@ namespace DB
|
|||||||
String format_name,
|
String format_name,
|
||||||
size_t start,
|
size_t start,
|
||||||
size_t end,
|
size_t end,
|
||||||
|
bool output_string_as_string,
|
||||||
std::unordered_map<String, std::shared_ptr<arrow::Array>> & dictionary_values)
|
std::unordered_map<String, std::shared_ptr<arrow::Array>> & dictionary_values)
|
||||||
{
|
{
|
||||||
const String column_type_name = column_type->getFamilyName();
|
const String column_type_name = column_type->getFamilyName();
|
||||||
@ -453,15 +459,21 @@ namespace DB
|
|||||||
DataTypePtr nested_type = assert_cast<const DataTypeNullable *>(column_type.get())->getNestedType();
|
DataTypePtr nested_type = assert_cast<const DataTypeNullable *>(column_type.get())->getNestedType();
|
||||||
const ColumnPtr & null_column = column_nullable->getNullMapColumnPtr();
|
const ColumnPtr & null_column = column_nullable->getNullMapColumnPtr();
|
||||||
const PaddedPODArray<UInt8> & bytemap = assert_cast<const ColumnVector<UInt8> &>(*null_column).getData();
|
const PaddedPODArray<UInt8> & bytemap = assert_cast<const ColumnVector<UInt8> &>(*null_column).getData();
|
||||||
fillArrowArray(column_name, nested_column, nested_type, &bytemap, array_builder, format_name, start, end, dictionary_values);
|
fillArrowArray(column_name, nested_column, nested_type, &bytemap, array_builder, format_name, start, end, output_string_as_string, dictionary_values);
|
||||||
}
|
}
|
||||||
else if (isString(column_type))
|
else if (isString(column_type))
|
||||||
{
|
{
|
||||||
fillArrowArrayWithStringColumnData<ColumnString>(column, null_bytemap, format_name, array_builder, start, end);
|
if (output_string_as_string)
|
||||||
|
fillArrowArrayWithStringColumnData<ColumnString, arrow::StringBuilder>(column, null_bytemap, format_name, array_builder, start, end);
|
||||||
|
else
|
||||||
|
fillArrowArrayWithStringColumnData<ColumnString, arrow::BinaryBuilder>(column, null_bytemap, format_name, array_builder, start, end);
|
||||||
}
|
}
|
||||||
else if (isFixedString(column_type))
|
else if (isFixedString(column_type))
|
||||||
{
|
{
|
||||||
fillArrowArrayWithStringColumnData<ColumnFixedString>(column, null_bytemap, format_name, array_builder, start, end);
|
if (output_string_as_string)
|
||||||
|
fillArrowArrayWithStringColumnData<ColumnFixedString, arrow::StringBuilder>(column, null_bytemap, format_name, array_builder, start, end);
|
||||||
|
else
|
||||||
|
fillArrowArrayWithStringColumnData<ColumnFixedString, arrow::BinaryBuilder>(column, null_bytemap, format_name, array_builder, start, end);
|
||||||
}
|
}
|
||||||
else if (isDate(column_type))
|
else if (isDate(column_type))
|
||||||
{
|
{
|
||||||
@ -477,21 +489,21 @@ namespace DB
|
|||||||
}
|
}
|
||||||
else if (isArray(column_type))
|
else if (isArray(column_type))
|
||||||
{
|
{
|
||||||
fillArrowArrayWithArrayColumnData<arrow::ListBuilder>(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, dictionary_values);
|
fillArrowArrayWithArrayColumnData<arrow::ListBuilder>(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, output_string_as_string, dictionary_values);
|
||||||
}
|
}
|
||||||
else if (isTuple(column_type))
|
else if (isTuple(column_type))
|
||||||
{
|
{
|
||||||
fillArrowArrayWithTupleColumnData(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, dictionary_values);
|
fillArrowArrayWithTupleColumnData(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, output_string_as_string, dictionary_values);
|
||||||
}
|
}
|
||||||
else if (column_type->getTypeId() == TypeIndex::LowCardinality)
|
else if (column_type->getTypeId() == TypeIndex::LowCardinality)
|
||||||
{
|
{
|
||||||
fillArrowArrayWithLowCardinalityColumnData(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, dictionary_values);
|
fillArrowArrayWithLowCardinalityColumnData(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, output_string_as_string, dictionary_values);
|
||||||
}
|
}
|
||||||
else if (isMap(column_type))
|
else if (isMap(column_type))
|
||||||
{
|
{
|
||||||
ColumnPtr column_array = assert_cast<const ColumnMap *>(column.get())->getNestedColumnPtr();
|
ColumnPtr column_array = assert_cast<const ColumnMap *>(column.get())->getNestedColumnPtr();
|
||||||
DataTypePtr array_type = assert_cast<const DataTypeMap *>(column_type.get())->getNestedType();
|
DataTypePtr array_type = assert_cast<const DataTypeMap *>(column_type.get())->getNestedType();
|
||||||
fillArrowArrayWithArrayColumnData<arrow::MapBuilder>(column_name, column_array, array_type, null_bytemap, array_builder, format_name, start, end, dictionary_values);
|
fillArrowArrayWithArrayColumnData<arrow::MapBuilder>(column_name, column_array, array_type, null_bytemap, array_builder, format_name, start, end, output_string_as_string, dictionary_values);
|
||||||
}
|
}
|
||||||
else if (isDecimal(column_type))
|
else if (isDecimal(column_type))
|
||||||
{
|
{
|
||||||
@ -603,13 +615,13 @@ namespace DB
|
|||||||
}
|
}
|
||||||
|
|
||||||
static std::shared_ptr<arrow::DataType> getArrowType(
|
static std::shared_ptr<arrow::DataType> getArrowType(
|
||||||
DataTypePtr column_type, ColumnPtr column, const std::string & column_name, const std::string & format_name, bool * out_is_column_nullable)
|
DataTypePtr column_type, ColumnPtr column, const std::string & column_name, const std::string & format_name, bool output_string_as_string, bool * out_is_column_nullable)
|
||||||
{
|
{
|
||||||
if (column_type->isNullable())
|
if (column_type->isNullable())
|
||||||
{
|
{
|
||||||
DataTypePtr nested_type = assert_cast<const DataTypeNullable *>(column_type.get())->getNestedType();
|
DataTypePtr nested_type = assert_cast<const DataTypeNullable *>(column_type.get())->getNestedType();
|
||||||
ColumnPtr nested_column = assert_cast<const ColumnNullable *>(column.get())->getNestedColumnPtr();
|
ColumnPtr nested_column = assert_cast<const ColumnNullable *>(column.get())->getNestedColumnPtr();
|
||||||
auto arrow_type = getArrowType(nested_type, nested_column, column_name, format_name, out_is_column_nullable);
|
auto arrow_type = getArrowType(nested_type, nested_column, column_name, format_name, output_string_as_string, out_is_column_nullable);
|
||||||
*out_is_column_nullable = true;
|
*out_is_column_nullable = true;
|
||||||
return arrow_type;
|
return arrow_type;
|
||||||
}
|
}
|
||||||
@ -643,7 +655,7 @@ namespace DB
|
|||||||
{
|
{
|
||||||
auto nested_type = assert_cast<const DataTypeArray *>(column_type.get())->getNestedType();
|
auto nested_type = assert_cast<const DataTypeArray *>(column_type.get())->getNestedType();
|
||||||
auto nested_column = assert_cast<const ColumnArray *>(column.get())->getDataPtr();
|
auto nested_column = assert_cast<const ColumnArray *>(column.get())->getDataPtr();
|
||||||
auto nested_arrow_type = getArrowType(nested_type, nested_column, column_name, format_name, out_is_column_nullable);
|
auto nested_arrow_type = getArrowType(nested_type, nested_column, column_name, format_name, output_string_as_string, out_is_column_nullable);
|
||||||
return arrow::list(nested_arrow_type);
|
return arrow::list(nested_arrow_type);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -655,7 +667,7 @@ namespace DB
|
|||||||
for (size_t i = 0; i != nested_types.size(); ++i)
|
for (size_t i = 0; i != nested_types.size(); ++i)
|
||||||
{
|
{
|
||||||
String name = column_name + "." + std::to_string(i);
|
String name = column_name + "." + std::to_string(i);
|
||||||
auto nested_arrow_type = getArrowType(nested_types[i], tuple_column->getColumnPtr(i), name, format_name, out_is_column_nullable);
|
auto nested_arrow_type = getArrowType(nested_types[i], tuple_column->getColumnPtr(i), name, format_name, output_string_as_string, out_is_column_nullable);
|
||||||
nested_fields.push_back(std::make_shared<arrow::Field>(name, nested_arrow_type, *out_is_column_nullable));
|
nested_fields.push_back(std::make_shared<arrow::Field>(name, nested_arrow_type, *out_is_column_nullable));
|
||||||
}
|
}
|
||||||
return arrow::struct_(nested_fields);
|
return arrow::struct_(nested_fields);
|
||||||
@ -669,7 +681,7 @@ namespace DB
|
|||||||
const auto & indexes_column = lc_column->getIndexesPtr();
|
const auto & indexes_column = lc_column->getIndexesPtr();
|
||||||
return arrow::dictionary(
|
return arrow::dictionary(
|
||||||
getArrowTypeForLowCardinalityIndexes(indexes_column),
|
getArrowTypeForLowCardinalityIndexes(indexes_column),
|
||||||
getArrowType(nested_type, nested_column, column_name, format_name, out_is_column_nullable));
|
getArrowType(nested_type, nested_column, column_name, format_name, output_string_as_string, out_is_column_nullable));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (isMap(column_type))
|
if (isMap(column_type))
|
||||||
@ -680,10 +692,19 @@ namespace DB
|
|||||||
|
|
||||||
const auto & columns = assert_cast<const ColumnMap *>(column.get())->getNestedData().getColumns();
|
const auto & columns = assert_cast<const ColumnMap *>(column.get())->getNestedData().getColumns();
|
||||||
return arrow::map(
|
return arrow::map(
|
||||||
getArrowType(key_type, columns[0], column_name, format_name, out_is_column_nullable),
|
getArrowType(key_type, columns[0], column_name, format_name, output_string_as_string, out_is_column_nullable),
|
||||||
getArrowType(val_type, columns[1], column_name, format_name, out_is_column_nullable));
|
getArrowType(val_type, columns[1], column_name, format_name, output_string_as_string, out_is_column_nullable));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (isDateTime64(column_type))
|
||||||
|
{
|
||||||
|
const auto * datetime64_type = assert_cast<const DataTypeDateTime64 *>(column_type.get());
|
||||||
|
return arrow::timestamp(getArrowTimeUnit(datetime64_type), datetime64_type->getTimeZone().getTimeZone());
|
||||||
|
}
|
||||||
|
|
||||||
|
if (isStringOrFixedString(column_type) && output_string_as_string)
|
||||||
|
return arrow::utf8();
|
||||||
|
|
||||||
const std::string type_name = column_type->getFamilyName();
|
const std::string type_name = column_type->getFamilyName();
|
||||||
if (const auto * arrow_type_it = std::find_if(
|
if (const auto * arrow_type_it = std::find_if(
|
||||||
internal_type_to_arrow_type.begin(),
|
internal_type_to_arrow_type.begin(),
|
||||||
@ -694,19 +715,13 @@ namespace DB
|
|||||||
return arrow_type_it->second;
|
return arrow_type_it->second;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (isDateTime64(column_type))
|
|
||||||
{
|
|
||||||
const auto * datetime64_type = assert_cast<const DataTypeDateTime64 *>(column_type.get());
|
|
||||||
return arrow::timestamp(getArrowTimeUnit(datetime64_type), datetime64_type->getTimeZone().getTimeZone());
|
|
||||||
}
|
|
||||||
|
|
||||||
throw Exception(ErrorCodes::UNKNOWN_TYPE,
|
throw Exception(ErrorCodes::UNKNOWN_TYPE,
|
||||||
"The type '{}' of a column '{}' is not supported for conversion into {} data format.",
|
"The type '{}' of a column '{}' is not supported for conversion into {} data format.",
|
||||||
column_type->getName(), column_name, format_name);
|
column_type->getName(), column_name, format_name);
|
||||||
}
|
}
|
||||||
|
|
||||||
CHColumnToArrowColumn::CHColumnToArrowColumn(const Block & header, const std::string & format_name_, bool low_cardinality_as_dictionary_)
|
CHColumnToArrowColumn::CHColumnToArrowColumn(const Block & header, const std::string & format_name_, bool low_cardinality_as_dictionary_, bool output_string_as_string_)
|
||||||
: format_name(format_name_), low_cardinality_as_dictionary(low_cardinality_as_dictionary_)
|
: format_name(format_name_), low_cardinality_as_dictionary(low_cardinality_as_dictionary_), output_string_as_string(output_string_as_string_)
|
||||||
{
|
{
|
||||||
arrow_fields.reserve(header.columns());
|
arrow_fields.reserve(header.columns());
|
||||||
header_columns.reserve(header.columns());
|
header_columns.reserve(header.columns());
|
||||||
@ -741,7 +756,7 @@ namespace DB
|
|||||||
if (!is_arrow_fields_initialized)
|
if (!is_arrow_fields_initialized)
|
||||||
{
|
{
|
||||||
bool is_column_nullable = false;
|
bool is_column_nullable = false;
|
||||||
auto arrow_type = getArrowType(header_column.type, column, header_column.name, format_name, &is_column_nullable);
|
auto arrow_type = getArrowType(header_column.type, column, header_column.name, format_name, output_string_as_string, &is_column_nullable);
|
||||||
arrow_fields.emplace_back(std::make_shared<arrow::Field>(header_column.name, arrow_type, is_column_nullable));
|
arrow_fields.emplace_back(std::make_shared<arrow::Field>(header_column.name, arrow_type, is_column_nullable));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -751,7 +766,7 @@ namespace DB
|
|||||||
checkStatus(status, column->getName(), format_name);
|
checkStatus(status, column->getName(), format_name);
|
||||||
|
|
||||||
fillArrowArray(
|
fillArrowArray(
|
||||||
header_column.name, column, header_column.type, nullptr, array_builder.get(), format_name, 0, column->size(), dictionary_values);
|
header_column.name, column, header_column.type, nullptr, array_builder.get(), format_name, 0, column->size(), output_string_as_string, dictionary_values);
|
||||||
|
|
||||||
std::shared_ptr<arrow::Array> arrow_array;
|
std::shared_ptr<arrow::Array> arrow_array;
|
||||||
status = array_builder->Finish(&arrow_array);
|
status = array_builder->Finish(&arrow_array);
|
||||||
|
@ -14,7 +14,7 @@ namespace DB
|
|||||||
class CHColumnToArrowColumn
|
class CHColumnToArrowColumn
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
CHColumnToArrowColumn(const Block & header, const std::string & format_name_, bool low_cardinality_as_dictionary_);
|
CHColumnToArrowColumn(const Block & header, const std::string & format_name_, bool low_cardinality_as_dictionary_, bool output_string_as_string_);
|
||||||
|
|
||||||
void chChunkToArrowTable(std::shared_ptr<arrow::Table> & res, const Chunk & chunk, size_t columns_num);
|
void chChunkToArrowTable(std::shared_ptr<arrow::Table> & res, const Chunk & chunk, size_t columns_num);
|
||||||
|
|
||||||
@ -32,6 +32,10 @@ private:
|
|||||||
/// because LowCardinality column from header always has indexes type UInt8, so, we should get
|
/// because LowCardinality column from header always has indexes type UInt8, so, we should get
|
||||||
/// proper indexes type from first chunk of data.
|
/// proper indexes type from first chunk of data.
|
||||||
bool is_arrow_fields_initialized = false;
|
bool is_arrow_fields_initialized = false;
|
||||||
|
|
||||||
|
/// Output columns with String data type as Arrow::String type.
|
||||||
|
/// By default Arrow::Binary is used.
|
||||||
|
bool output_string_as_string = false;
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -100,6 +100,8 @@ ORC_UNIQUE_PTR<orc::Type> ORCBlockOutputFormat::getORCType(const DataTypePtr & t
|
|||||||
case TypeIndex::FixedString: [[fallthrough]];
|
case TypeIndex::FixedString: [[fallthrough]];
|
||||||
case TypeIndex::String:
|
case TypeIndex::String:
|
||||||
{
|
{
|
||||||
|
if (format_settings.orc.output_string_as_string)
|
||||||
|
return orc::createPrimitiveType(orc::TypeKind::STRING);
|
||||||
return orc::createPrimitiveType(orc::TypeKind::BINARY);
|
return orc::createPrimitiveType(orc::TypeKind::BINARY);
|
||||||
}
|
}
|
||||||
case TypeIndex::Nullable:
|
case TypeIndex::Nullable:
|
||||||
|
@ -29,7 +29,7 @@ void ParquetBlockOutputFormat::consume(Chunk chunk)
|
|||||||
if (!ch_column_to_arrow_column)
|
if (!ch_column_to_arrow_column)
|
||||||
{
|
{
|
||||||
const Block & header = getPort(PortKind::Main).getHeader();
|
const Block & header = getPort(PortKind::Main).getHeader();
|
||||||
ch_column_to_arrow_column = std::make_unique<CHColumnToArrowColumn>(header, "Parquet", false);
|
ch_column_to_arrow_column = std::make_unique<CHColumnToArrowColumn>(header, "Parquet", false, format_settings.parquet.output_string_as_string);
|
||||||
}
|
}
|
||||||
|
|
||||||
ch_column_to_arrow_column->chChunkToArrowTable(arrow_table, chunk, columns_num);
|
ch_column_to_arrow_column->chChunkToArrowTable(arrow_table, chunk, columns_num);
|
||||||
|
@ -0,0 +1,3 @@
|
|||||||
|
s Nullable(String)
|
||||||
|
s Nullable(String)
|
||||||
|
s Nullable(String)
|
@ -0,0 +1,6 @@
|
|||||||
|
insert into function file(data_02304.parquet) select 'hello' as s from numbers(3) settings engine_file_truncate_on_insert=1, output_format_parquet_string_as_string=1;
|
||||||
|
desc file(data_02304.parquet);
|
||||||
|
insert into function file(data_02304.orc) select 'hello' as s from numbers(3) settings engine_file_truncate_on_insert=1, output_format_orc_string_as_string=1;
|
||||||
|
desc file(data_02304.orc);
|
||||||
|
insert into function file(data_02304.arrow) select 'hello' as s from numbers(3) settings engine_file_truncate_on_insert=1, output_format_arrow_string_as_string=1;
|
||||||
|
desc file(data_02304.arrow);
|
Loading…
Reference in New Issue
Block a user