mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-20 08:40:50 +00:00
Allow to use String type instead of Binary in Arrow/Parquet/ORC formats
This commit is contained in:
parent
a19d4c6f1f
commit
a0369fb9a6
@ -698,6 +698,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
|
||||
M(Bool, output_format_pretty_color, true, "Use ANSI escape sequences to paint colors in Pretty formats", 0) \
|
||||
M(String, output_format_pretty_grid_charset, "UTF-8", "Charset for printing grid borders. Available charsets: ASCII, UTF-8 (default one).", 0) \
|
||||
M(UInt64, output_format_parquet_row_group_size, 1000000, "Row group size in rows.", 0) \
|
||||
M(Bool, output_format_parquet_string_as_string, false, "Use Parquet String type instead of Binary for String columns.", 0) \
|
||||
M(String, output_format_avro_codec, "", "Compression codec used for output. Possible values: 'null', 'deflate', 'snappy'.", 0) \
|
||||
M(UInt64, output_format_avro_sync_interval, 16 * 1024, "Sync interval in bytes.", 0) \
|
||||
M(String, output_format_avro_string_column_pattern, "", "For Avro format: regexp of String columns to select as AVRO string.", 0) \
|
||||
@ -735,6 +736,9 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
|
||||
M(UInt64, cross_to_inner_join_rewrite, 1, "Use inner join instead of comma/cross join if possible. Possible values: 0 - no rewrite, 1 - apply if possible, 2 - force rewrite all cross joins", 0) \
|
||||
\
|
||||
M(Bool, output_format_arrow_low_cardinality_as_dictionary, false, "Enable output LowCardinality type as Dictionary Arrow type", 0) \
|
||||
M(Bool, output_format_arrow_string_as_string, false, "Use Arrow String type instead of Binary for String columns", 0) \
|
||||
\
|
||||
M(Bool, output_format_orc_string_as_string, false, "Use ORC String type instead of Binary for String columns", 0) \
|
||||
\
|
||||
M(EnumComparingMode, format_capn_proto_enum_comparising_mode, FormatSettings::EnumComparingMode::BY_VALUES, "How to map ClickHouse Enum and CapnProto Enum", 0) \
|
||||
\
|
||||
|
@ -99,6 +99,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
|
||||
format_settings.parquet.case_insensitive_column_matching = settings.input_format_parquet_case_insensitive_column_matching;
|
||||
format_settings.parquet.allow_missing_columns = settings.input_format_parquet_allow_missing_columns;
|
||||
format_settings.parquet.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference;
|
||||
format_settings.parquet.output_string_as_string = settings.output_format_parquet_string_as_string;
|
||||
format_settings.pretty.charset = settings.output_format_pretty_grid_charset.toString() == "ASCII" ? FormatSettings::Pretty::Charset::ASCII : FormatSettings::Pretty::Charset::UTF8;
|
||||
format_settings.pretty.color = settings.output_format_pretty_color;
|
||||
format_settings.pretty.max_column_pad_width = settings.output_format_pretty_max_column_pad_width;
|
||||
@ -132,17 +133,19 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
|
||||
format_settings.arrow.import_nested = settings.input_format_arrow_import_nested;
|
||||
format_settings.arrow.allow_missing_columns = settings.input_format_arrow_allow_missing_columns;
|
||||
format_settings.arrow.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference;
|
||||
format_settings.arrow.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference;
|
||||
format_settings.arrow.case_insensitive_column_matching = settings.input_format_arrow_case_insensitive_column_matching;
|
||||
format_settings.arrow.output_string_as_string = settings.output_format_arrow_string_as_string;
|
||||
format_settings.orc.import_nested = settings.input_format_orc_import_nested;
|
||||
format_settings.orc.allow_missing_columns = settings.input_format_orc_allow_missing_columns;
|
||||
format_settings.orc.row_batch_size = settings.input_format_orc_row_batch_size;
|
||||
format_settings.orc.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_orc_skip_columns_with_unsupported_types_in_schema_inference;
|
||||
format_settings.arrow.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference;
|
||||
format_settings.arrow.case_insensitive_column_matching = settings.input_format_arrow_case_insensitive_column_matching;
|
||||
format_settings.orc.import_nested = settings.input_format_orc_import_nested;
|
||||
format_settings.orc.allow_missing_columns = settings.input_format_orc_allow_missing_columns;
|
||||
format_settings.orc.row_batch_size = settings.input_format_orc_row_batch_size;
|
||||
format_settings.orc.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_orc_skip_columns_with_unsupported_types_in_schema_inference;
|
||||
format_settings.orc.case_insensitive_column_matching = settings.input_format_orc_case_insensitive_column_matching;
|
||||
format_settings.orc.output_string_as_string = settings.output_format_orc_string_as_string;
|
||||
format_settings.defaults_for_omitted_fields = settings.input_format_defaults_for_omitted_fields;
|
||||
format_settings.capn_proto.enum_comparing_mode = settings.format_capn_proto_enum_comparising_mode;
|
||||
format_settings.seekable_read = settings.input_format_allow_seeks;
|
||||
|
@ -81,6 +81,7 @@ struct FormatSettings
|
||||
bool allow_missing_columns = false;
|
||||
bool skip_columns_with_unsupported_types_in_schema_inference = false;
|
||||
bool case_insensitive_column_matching = false;
|
||||
bool output_string_as_string = false;
|
||||
} arrow;
|
||||
|
||||
struct
|
||||
@ -148,6 +149,7 @@ struct FormatSettings
|
||||
bool skip_columns_with_unsupported_types_in_schema_inference = false;
|
||||
bool case_insensitive_column_matching = false;
|
||||
std::unordered_set<int> skip_row_groups = {};
|
||||
bool output_string_as_string = false;
|
||||
} parquet;
|
||||
|
||||
struct Pretty
|
||||
@ -234,6 +236,7 @@ struct FormatSettings
|
||||
bool skip_columns_with_unsupported_types_in_schema_inference = false;
|
||||
bool case_insensitive_column_matching = false;
|
||||
std::unordered_set<int> skip_stripes = {};
|
||||
bool output_string_as_string = false;
|
||||
} orc;
|
||||
|
||||
/// For capnProto format we should determine how to
|
||||
|
@ -34,7 +34,7 @@ void ArrowBlockOutputFormat::consume(Chunk chunk)
|
||||
{
|
||||
const Block & header = getPort(PortKind::Main).getHeader();
|
||||
ch_column_to_arrow_column
|
||||
= std::make_unique<CHColumnToArrowColumn>(header, "Arrow", format_settings.arrow.low_cardinality_as_dictionary);
|
||||
= std::make_unique<CHColumnToArrowColumn>(header, "Arrow", format_settings.arrow.low_cardinality_as_dictionary, format_settings.arrow.output_string_as_string);
|
||||
}
|
||||
|
||||
ch_column_to_arrow_column->chChunkToArrowTable(arrow_table, chunk, columns_num);
|
||||
|
@ -36,6 +36,8 @@
|
||||
#include <boost/algorithm/string.hpp>
|
||||
#include <boost/algorithm/string/case_conv.hpp>
|
||||
|
||||
#include <Common/logger_useful.h>
|
||||
|
||||
/// UINT16 and UINT32 are processed separately, see comments in readColumnFromArrowColumn.
|
||||
#define FOR_ARROW_NUMERIC_TYPES(M) \
|
||||
M(arrow::Type::UINT8, DB::UInt8) \
|
||||
|
@ -168,6 +168,7 @@ namespace DB
|
||||
String format_name,
|
||||
size_t start,
|
||||
size_t end,
|
||||
bool output_string_as_string,
|
||||
std::unordered_map<String, std::shared_ptr<arrow::Array>> & dictionary_values);
|
||||
|
||||
template <typename Builder>
|
||||
@ -180,6 +181,7 @@ namespace DB
|
||||
String format_name,
|
||||
size_t start,
|
||||
size_t end,
|
||||
bool output_string_as_string,
|
||||
std::unordered_map<String, std::shared_ptr<arrow::Array>> & dictionary_values)
|
||||
{
|
||||
const auto * column_array = assert_cast<const ColumnArray *>(column.get());
|
||||
@ -196,7 +198,7 @@ namespace DB
|
||||
/// Start new array.
|
||||
components_status = builder.Append();
|
||||
checkStatus(components_status, nested_column->getName(), format_name);
|
||||
fillArrowArray(column_name, nested_column, nested_type, null_bytemap, value_builder, format_name, offsets[array_idx - 1], offsets[array_idx], dictionary_values);
|
||||
fillArrowArray(column_name, nested_column, nested_type, null_bytemap, value_builder, format_name, offsets[array_idx - 1], offsets[array_idx], output_string_as_string, dictionary_values);
|
||||
}
|
||||
}
|
||||
|
||||
@ -209,6 +211,7 @@ namespace DB
|
||||
String format_name,
|
||||
size_t start,
|
||||
size_t end,
|
||||
bool output_string_as_string,
|
||||
std::unordered_map<String, std::shared_ptr<arrow::Array>> & dictionary_values)
|
||||
{
|
||||
const auto * column_tuple = assert_cast<const ColumnTuple *>(column.get());
|
||||
@ -219,7 +222,7 @@ namespace DB
|
||||
for (size_t i = 0; i != column_tuple->tupleSize(); ++i)
|
||||
{
|
||||
ColumnPtr nested_column = column_tuple->getColumnPtr(i);
|
||||
fillArrowArray(column_name + "." + std::to_string(i), nested_column, nested_types[i], null_bytemap, builder.field_builder(i), format_name, start, end, dictionary_values);
|
||||
fillArrowArray(column_name + "." + std::to_string(i), nested_column, nested_types[i], null_bytemap, builder.field_builder(i), format_name, start, end, output_string_as_string, dictionary_values);
|
||||
}
|
||||
|
||||
for (size_t i = start; i != end; ++i)
|
||||
@ -267,6 +270,7 @@ namespace DB
|
||||
String format_name,
|
||||
size_t start,
|
||||
size_t end,
|
||||
bool output_string_as_string,
|
||||
std::unordered_map<String, std::shared_ptr<arrow::Array>> & dictionary_values)
|
||||
{
|
||||
const auto * column_lc = assert_cast<const ColumnLowCardinality *>(column.get());
|
||||
@ -284,7 +288,7 @@ namespace DB
|
||||
|
||||
auto dict_column = column_lc->getDictionary().getNestedColumn();
|
||||
const auto & dict_type = assert_cast<const DataTypeLowCardinality *>(column_type.get())->getDictionaryType();
|
||||
fillArrowArray(column_name, dict_column, dict_type, nullptr, values_builder.get(), format_name, 0, dict_column->size(), dictionary_values);
|
||||
fillArrowArray(column_name, dict_column, dict_type, nullptr, values_builder.get(), format_name, 0, dict_column->size(), output_string_as_string, dictionary_values);
|
||||
status = values_builder->Finish(&dict_values);
|
||||
checkStatus(status, column->getName(), format_name);
|
||||
}
|
||||
@ -321,6 +325,7 @@ namespace DB
|
||||
String format_name,
|
||||
size_t start,
|
||||
size_t end,
|
||||
bool output_string_as_string,
|
||||
std::unordered_map<String, std::shared_ptr<arrow::Array>> & dictionary_values)
|
||||
{
|
||||
auto value_type = assert_cast<arrow::DictionaryType *>(array_builder->type().get())->value_type();
|
||||
@ -328,7 +333,7 @@ namespace DB
|
||||
#define DISPATCH(ARROW_TYPE_ID, ARROW_TYPE) \
|
||||
if (arrow::Type::ARROW_TYPE_ID == value_type->id()) \
|
||||
{ \
|
||||
fillArrowArrayWithLowCardinalityColumnDataImpl<ARROW_TYPE>(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, dictionary_values); \
|
||||
fillArrowArrayWithLowCardinalityColumnDataImpl<ARROW_TYPE>(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, output_string_as_string, dictionary_values); \
|
||||
return; \
|
||||
}
|
||||
|
||||
@ -338,7 +343,7 @@ namespace DB
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot fill arrow array with {} data.", column_type->getName());
|
||||
}
|
||||
|
||||
template <typename ColumnType>
|
||||
template <typename ColumnType, typename ArrowBuilder>
|
||||
static void fillArrowArrayWithStringColumnData(
|
||||
ColumnPtr write_column,
|
||||
const PaddedPODArray<UInt8> * null_bytemap,
|
||||
@ -348,7 +353,7 @@ namespace DB
|
||||
size_t end)
|
||||
{
|
||||
const auto & internal_column = assert_cast<const ColumnType &>(*write_column);
|
||||
arrow::BinaryBuilder & builder = assert_cast<arrow::BinaryBuilder &>(*array_builder);
|
||||
ArrowBuilder & builder = assert_cast<ArrowBuilder &>(*array_builder);
|
||||
arrow::Status status;
|
||||
|
||||
for (size_t string_i = start; string_i < end; ++string_i)
|
||||
@ -442,6 +447,7 @@ namespace DB
|
||||
String format_name,
|
||||
size_t start,
|
||||
size_t end,
|
||||
bool output_string_as_string,
|
||||
std::unordered_map<String, std::shared_ptr<arrow::Array>> & dictionary_values)
|
||||
{
|
||||
const String column_type_name = column_type->getFamilyName();
|
||||
@ -453,15 +459,21 @@ namespace DB
|
||||
DataTypePtr nested_type = assert_cast<const DataTypeNullable *>(column_type.get())->getNestedType();
|
||||
const ColumnPtr & null_column = column_nullable->getNullMapColumnPtr();
|
||||
const PaddedPODArray<UInt8> & bytemap = assert_cast<const ColumnVector<UInt8> &>(*null_column).getData();
|
||||
fillArrowArray(column_name, nested_column, nested_type, &bytemap, array_builder, format_name, start, end, dictionary_values);
|
||||
fillArrowArray(column_name, nested_column, nested_type, &bytemap, array_builder, format_name, start, end, output_string_as_string, dictionary_values);
|
||||
}
|
||||
else if (isString(column_type))
|
||||
{
|
||||
fillArrowArrayWithStringColumnData<ColumnString>(column, null_bytemap, format_name, array_builder, start, end);
|
||||
if (output_string_as_string)
|
||||
fillArrowArrayWithStringColumnData<ColumnString, arrow::StringBuilder>(column, null_bytemap, format_name, array_builder, start, end);
|
||||
else
|
||||
fillArrowArrayWithStringColumnData<ColumnString, arrow::BinaryBuilder>(column, null_bytemap, format_name, array_builder, start, end);
|
||||
}
|
||||
else if (isFixedString(column_type))
|
||||
{
|
||||
fillArrowArrayWithStringColumnData<ColumnFixedString>(column, null_bytemap, format_name, array_builder, start, end);
|
||||
if (output_string_as_string)
|
||||
fillArrowArrayWithStringColumnData<ColumnFixedString, arrow::StringBuilder>(column, null_bytemap, format_name, array_builder, start, end);
|
||||
else
|
||||
fillArrowArrayWithStringColumnData<ColumnFixedString, arrow::BinaryBuilder>(column, null_bytemap, format_name, array_builder, start, end);
|
||||
}
|
||||
else if (isDate(column_type))
|
||||
{
|
||||
@ -477,21 +489,21 @@ namespace DB
|
||||
}
|
||||
else if (isArray(column_type))
|
||||
{
|
||||
fillArrowArrayWithArrayColumnData<arrow::ListBuilder>(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, dictionary_values);
|
||||
fillArrowArrayWithArrayColumnData<arrow::ListBuilder>(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, output_string_as_string, dictionary_values);
|
||||
}
|
||||
else if (isTuple(column_type))
|
||||
{
|
||||
fillArrowArrayWithTupleColumnData(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, dictionary_values);
|
||||
fillArrowArrayWithTupleColumnData(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, output_string_as_string, dictionary_values);
|
||||
}
|
||||
else if (column_type->getTypeId() == TypeIndex::LowCardinality)
|
||||
{
|
||||
fillArrowArrayWithLowCardinalityColumnData(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, dictionary_values);
|
||||
fillArrowArrayWithLowCardinalityColumnData(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, output_string_as_string, dictionary_values);
|
||||
}
|
||||
else if (isMap(column_type))
|
||||
{
|
||||
ColumnPtr column_array = assert_cast<const ColumnMap *>(column.get())->getNestedColumnPtr();
|
||||
DataTypePtr array_type = assert_cast<const DataTypeMap *>(column_type.get())->getNestedType();
|
||||
fillArrowArrayWithArrayColumnData<arrow::MapBuilder>(column_name, column_array, array_type, null_bytemap, array_builder, format_name, start, end, dictionary_values);
|
||||
fillArrowArrayWithArrayColumnData<arrow::MapBuilder>(column_name, column_array, array_type, null_bytemap, array_builder, format_name, start, end, output_string_as_string, dictionary_values);
|
||||
}
|
||||
else if (isDecimal(column_type))
|
||||
{
|
||||
@ -603,13 +615,13 @@ namespace DB
|
||||
}
|
||||
|
||||
static std::shared_ptr<arrow::DataType> getArrowType(
|
||||
DataTypePtr column_type, ColumnPtr column, const std::string & column_name, const std::string & format_name, bool * out_is_column_nullable)
|
||||
DataTypePtr column_type, ColumnPtr column, const std::string & column_name, const std::string & format_name, bool output_string_as_string, bool * out_is_column_nullable)
|
||||
{
|
||||
if (column_type->isNullable())
|
||||
{
|
||||
DataTypePtr nested_type = assert_cast<const DataTypeNullable *>(column_type.get())->getNestedType();
|
||||
ColumnPtr nested_column = assert_cast<const ColumnNullable *>(column.get())->getNestedColumnPtr();
|
||||
auto arrow_type = getArrowType(nested_type, nested_column, column_name, format_name, out_is_column_nullable);
|
||||
auto arrow_type = getArrowType(nested_type, nested_column, column_name, format_name, output_string_as_string, out_is_column_nullable);
|
||||
*out_is_column_nullable = true;
|
||||
return arrow_type;
|
||||
}
|
||||
@ -643,7 +655,7 @@ namespace DB
|
||||
{
|
||||
auto nested_type = assert_cast<const DataTypeArray *>(column_type.get())->getNestedType();
|
||||
auto nested_column = assert_cast<const ColumnArray *>(column.get())->getDataPtr();
|
||||
auto nested_arrow_type = getArrowType(nested_type, nested_column, column_name, format_name, out_is_column_nullable);
|
||||
auto nested_arrow_type = getArrowType(nested_type, nested_column, column_name, format_name, output_string_as_string, out_is_column_nullable);
|
||||
return arrow::list(nested_arrow_type);
|
||||
}
|
||||
|
||||
@ -655,7 +667,7 @@ namespace DB
|
||||
for (size_t i = 0; i != nested_types.size(); ++i)
|
||||
{
|
||||
String name = column_name + "." + std::to_string(i);
|
||||
auto nested_arrow_type = getArrowType(nested_types[i], tuple_column->getColumnPtr(i), name, format_name, out_is_column_nullable);
|
||||
auto nested_arrow_type = getArrowType(nested_types[i], tuple_column->getColumnPtr(i), name, format_name, output_string_as_string, out_is_column_nullable);
|
||||
nested_fields.push_back(std::make_shared<arrow::Field>(name, nested_arrow_type, *out_is_column_nullable));
|
||||
}
|
||||
return arrow::struct_(nested_fields);
|
||||
@ -669,7 +681,7 @@ namespace DB
|
||||
const auto & indexes_column = lc_column->getIndexesPtr();
|
||||
return arrow::dictionary(
|
||||
getArrowTypeForLowCardinalityIndexes(indexes_column),
|
||||
getArrowType(nested_type, nested_column, column_name, format_name, out_is_column_nullable));
|
||||
getArrowType(nested_type, nested_column, column_name, format_name, output_string_as_string, out_is_column_nullable));
|
||||
}
|
||||
|
||||
if (isMap(column_type))
|
||||
@ -680,10 +692,19 @@ namespace DB
|
||||
|
||||
const auto & columns = assert_cast<const ColumnMap *>(column.get())->getNestedData().getColumns();
|
||||
return arrow::map(
|
||||
getArrowType(key_type, columns[0], column_name, format_name, out_is_column_nullable),
|
||||
getArrowType(val_type, columns[1], column_name, format_name, out_is_column_nullable));
|
||||
getArrowType(key_type, columns[0], column_name, format_name, output_string_as_string, out_is_column_nullable),
|
||||
getArrowType(val_type, columns[1], column_name, format_name, output_string_as_string, out_is_column_nullable));
|
||||
}
|
||||
|
||||
if (isDateTime64(column_type))
|
||||
{
|
||||
const auto * datetime64_type = assert_cast<const DataTypeDateTime64 *>(column_type.get());
|
||||
return arrow::timestamp(getArrowTimeUnit(datetime64_type), datetime64_type->getTimeZone().getTimeZone());
|
||||
}
|
||||
|
||||
if (isStringOrFixedString(column_type) && output_string_as_string)
|
||||
return arrow::utf8();
|
||||
|
||||
const std::string type_name = column_type->getFamilyName();
|
||||
if (const auto * arrow_type_it = std::find_if(
|
||||
internal_type_to_arrow_type.begin(),
|
||||
@ -694,19 +715,13 @@ namespace DB
|
||||
return arrow_type_it->second;
|
||||
}
|
||||
|
||||
if (isDateTime64(column_type))
|
||||
{
|
||||
const auto * datetime64_type = assert_cast<const DataTypeDateTime64 *>(column_type.get());
|
||||
return arrow::timestamp(getArrowTimeUnit(datetime64_type), datetime64_type->getTimeZone().getTimeZone());
|
||||
}
|
||||
|
||||
throw Exception(ErrorCodes::UNKNOWN_TYPE,
|
||||
"The type '{}' of a column '{}' is not supported for conversion into {} data format.",
|
||||
column_type->getName(), column_name, format_name);
|
||||
}
|
||||
|
||||
CHColumnToArrowColumn::CHColumnToArrowColumn(const Block & header, const std::string & format_name_, bool low_cardinality_as_dictionary_)
|
||||
: format_name(format_name_), low_cardinality_as_dictionary(low_cardinality_as_dictionary_)
|
||||
CHColumnToArrowColumn::CHColumnToArrowColumn(const Block & header, const std::string & format_name_, bool low_cardinality_as_dictionary_, bool output_string_as_string_)
|
||||
: format_name(format_name_), low_cardinality_as_dictionary(low_cardinality_as_dictionary_), output_string_as_string(output_string_as_string_)
|
||||
{
|
||||
arrow_fields.reserve(header.columns());
|
||||
header_columns.reserve(header.columns());
|
||||
@ -741,7 +756,7 @@ namespace DB
|
||||
if (!is_arrow_fields_initialized)
|
||||
{
|
||||
bool is_column_nullable = false;
|
||||
auto arrow_type = getArrowType(header_column.type, column, header_column.name, format_name, &is_column_nullable);
|
||||
auto arrow_type = getArrowType(header_column.type, column, header_column.name, format_name, output_string_as_string, &is_column_nullable);
|
||||
arrow_fields.emplace_back(std::make_shared<arrow::Field>(header_column.name, arrow_type, is_column_nullable));
|
||||
}
|
||||
|
||||
@ -751,7 +766,7 @@ namespace DB
|
||||
checkStatus(status, column->getName(), format_name);
|
||||
|
||||
fillArrowArray(
|
||||
header_column.name, column, header_column.type, nullptr, array_builder.get(), format_name, 0, column->size(), dictionary_values);
|
||||
header_column.name, column, header_column.type, nullptr, array_builder.get(), format_name, 0, column->size(), output_string_as_string, dictionary_values);
|
||||
|
||||
std::shared_ptr<arrow::Array> arrow_array;
|
||||
status = array_builder->Finish(&arrow_array);
|
||||
|
@ -14,7 +14,7 @@ namespace DB
|
||||
class CHColumnToArrowColumn
|
||||
{
|
||||
public:
|
||||
CHColumnToArrowColumn(const Block & header, const std::string & format_name_, bool low_cardinality_as_dictionary_);
|
||||
CHColumnToArrowColumn(const Block & header, const std::string & format_name_, bool low_cardinality_as_dictionary_, bool output_string_as_string_);
|
||||
|
||||
void chChunkToArrowTable(std::shared_ptr<arrow::Table> & res, const Chunk & chunk, size_t columns_num);
|
||||
|
||||
@ -32,6 +32,10 @@ private:
|
||||
/// because LowCardinality column from header always has indexes type UInt8, so, we should get
|
||||
/// proper indexes type from first chunk of data.
|
||||
bool is_arrow_fields_initialized = false;
|
||||
|
||||
/// Output columns with String data type as Arrow::String type.
|
||||
/// By default Arrow::Binary is used.
|
||||
bool output_string_as_string = false;
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -100,6 +100,8 @@ ORC_UNIQUE_PTR<orc::Type> ORCBlockOutputFormat::getORCType(const DataTypePtr & t
|
||||
case TypeIndex::FixedString: [[fallthrough]];
|
||||
case TypeIndex::String:
|
||||
{
|
||||
if (format_settings.orc.output_string_as_string)
|
||||
return orc::createPrimitiveType(orc::TypeKind::STRING);
|
||||
return orc::createPrimitiveType(orc::TypeKind::BINARY);
|
||||
}
|
||||
case TypeIndex::Nullable:
|
||||
|
@ -29,7 +29,7 @@ void ParquetBlockOutputFormat::consume(Chunk chunk)
|
||||
if (!ch_column_to_arrow_column)
|
||||
{
|
||||
const Block & header = getPort(PortKind::Main).getHeader();
|
||||
ch_column_to_arrow_column = std::make_unique<CHColumnToArrowColumn>(header, "Parquet", false);
|
||||
ch_column_to_arrow_column = std::make_unique<CHColumnToArrowColumn>(header, "Parquet", false, format_settings.parquet.output_string_as_string);
|
||||
}
|
||||
|
||||
ch_column_to_arrow_column->chChunkToArrowTable(arrow_table, chunk, columns_num);
|
||||
|
@ -0,0 +1,3 @@
|
||||
s Nullable(String)
|
||||
s Nullable(String)
|
||||
s Nullable(String)
|
@ -0,0 +1,6 @@
|
||||
insert into function file(data_02304.parquet) select 'hello' as s from numbers(3) settings engine_file_truncate_on_insert=1, output_format_parquet_string_as_string=1;
|
||||
desc file(data_02304.parquet);
|
||||
insert into function file(data_02304.orc) select 'hello' as s from numbers(3) settings engine_file_truncate_on_insert=1, output_format_orc_string_as_string=1;
|
||||
desc file(data_02304.orc);
|
||||
insert into function file(data_02304.arrow) select 'hello' as s from numbers(3) settings engine_file_truncate_on_insert=1, output_format_arrow_string_as_string=1;
|
||||
desc file(data_02304.arrow);
|
Loading…
Reference in New Issue
Block a user