From 2e9858172e9e5425f2260fe8af408c67aa2621cd Mon Sep 17 00:00:00 2001 From: Pavel Kruglov Date: Wed, 15 Sep 2021 22:04:36 +0300 Subject: [PATCH] Allow to input and output LowCardinality columns in ORC format --- .../Formats/Impl/ORCBlockOutputFormat.cpp | 16 +++++++++++----- .../02029_orc_low_cardinality.reference | 10 ++++++++++ .../0_stateless/02029_orc_low_cardinality.sh | 16 ++++++++++++++++ 3 files changed, 37 insertions(+), 5 deletions(-) create mode 100644 tests/queries/0_stateless/02029_orc_low_cardinality.reference create mode 100755 tests/queries/0_stateless/02029_orc_low_cardinality.sh diff --git a/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp b/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp index a5143792e7d..84338d3b23d 100644 --- a/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -20,6 +21,7 @@ #include #include #include +#include namespace DB { @@ -48,8 +50,10 @@ void ORCOutputStream::write(const void* buf, size_t length) } ORCBlockOutputFormat::ORCBlockOutputFormat(WriteBuffer & out_, const Block & header_, const FormatSettings & format_settings_) - : IOutputFormat(header_, out_), format_settings{format_settings_}, output_stream(out_), data_types(header_.getDataTypes()) + : IOutputFormat(header_, out_), format_settings{format_settings_}, output_stream(out_) { + for (const auto & type : header_.getDataTypes()) + data_types.push_back(recursiveRemoveLowCardinality(type)); } ORC_UNIQUE_PTR ORCBlockOutputFormat::getORCType(const DataTypePtr & type, const std::string & column_name) @@ -482,10 +486,12 @@ void ORCBlockOutputFormat::consume(Chunk chunk) /// The size of the batch must be no less than total amount of array elements. 
ORC_UNIQUE_PTR<orc::ColumnVectorBatch> batch = writer->createRowBatch(getMaxColumnSize(chunk)); orc::StructVectorBatch & root = dynamic_cast<orc::StructVectorBatch &>(*batch); + auto columns = chunk.getColumns(); + for (auto & column : columns) + column = recursiveRemoveLowCardinality(column); + for (size_t i = 0; i != columns_num; ++i) - { - writeColumn(*root.fields[i], *chunk.getColumns()[i], data_types[i], nullptr); - } + writeColumn(*root.fields[i], *columns[i], data_types[i], nullptr); root.numElements = rows_num; writer->add(*batch); } @@ -505,7 +511,7 @@ void ORCBlockOutputFormat::prepareWriter() options.setCompression(orc::CompressionKind::CompressionKind_NONE); size_t columns_count = header.columns(); for (size_t i = 0; i != columns_count; ++i) - schema->addStructField(header.safeGetByPosition(i).name, getORCType(data_types[i], header.safeGetByPosition(i).name)); + schema->addStructField(header.safeGetByPosition(i).name, getORCType(recursiveRemoveLowCardinality(data_types[i]), header.safeGetByPosition(i).name)); writer = orc::createWriter(*schema, &output_stream, options); } diff --git a/tests/queries/0_stateless/02029_orc_low_cardinality.reference b/tests/queries/0_stateless/02029_orc_low_cardinality.reference new file mode 100644 index 00000000000..51537082c87 --- /dev/null +++ b/tests/queries/0_stateless/02029_orc_low_cardinality.reference @@ -0,0 +1,10 @@ +0 ['0'] ('0') +1 ['1'] ('1') +2 ['2'] ('2') +3 ['3'] ('3') +4 ['4'] ('4') +5 ['5'] ('5') +6 ['6'] ('6') +7 ['7'] ('7') +8 ['8'] ('8') +9 ['9'] ('9') diff --git a/tests/queries/0_stateless/02029_orc_low_cardinality.sh b/tests/queries/0_stateless/02029_orc_low_cardinality.sh new file mode 100755 index 00000000000..44584a8d969 --- /dev/null +++ b/tests/queries/0_stateless/02029_orc_low_cardinality.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash +# Tags: no-unbundled, no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS orc_lc"; + + +$CLICKHOUSE_CLIENT --query="CREATE TABLE orc_lc (lc LowCardinality(String), array_lc Array(LowCardinality(String)), tuple_lc Tuple(LowCardinality(String))) ENGINE = Memory()"; + + +$CLICKHOUSE_CLIENT --query="SELECT [lc] as array_lc, tuple(lc) as tuple_lc, toLowCardinality(toString(number)) as lc from numbers(10) FORMAT ORC" | $CLICKHOUSE_CLIENT --query="INSERT INTO orc_lc FORMAT ORC"; + +$CLICKHOUSE_CLIENT --query="SELECT * FROM orc_lc";