From 5ab5902f38906254af902d01cc71851f6cb78401 Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 1 Mar 2023 21:27:46 +0000 Subject: [PATCH 1/8] Allow control compression in Parquet/ORC/Arrow output formats, support more compression for input formats --- contrib/arrow-cmake/CMakeLists.txt | 20 +++++++++- contrib/orc | 2 +- src/Core/Settings.h | 3 ++ src/Core/SettingsEnums.cpp | 23 +++++++++++- src/Core/SettingsEnums.h | 6 +++ src/Formats/FormatFactory.cpp | 3 ++ src/Formats/FormatSettings.h | 29 +++++++++++++++ .../Formats/Impl/ArrowBlockOutputFormat.cpp | 26 ++++++++++++- .../Formats/Impl/ORCBlockOutputFormat.cpp | 32 +++++++++++++++- .../Formats/Impl/ParquetBlockOutputFormat.cpp | 37 +++++++++++++++++-- 10 files changed, 171 insertions(+), 10 deletions(-) diff --git a/contrib/arrow-cmake/CMakeLists.txt b/contrib/arrow-cmake/CMakeLists.txt index ae6f270a768..4181f916d63 100644 --- a/contrib/arrow-cmake/CMakeLists.txt +++ b/contrib/arrow-cmake/CMakeLists.txt @@ -115,6 +115,13 @@ configure_file("${ORC_SOURCE_SRC_DIR}/Adaptor.hh.in" "${ORC_BUILD_INCLUDE_DIR}/A # ARROW_ORC + adapters/orc/CMakefiles set(ORC_SRCS + "${CMAKE_CURRENT_BINARY_DIR}/orc_proto.pb.h" + "${ORC_SOURCE_SRC_DIR}/sargs/ExpressionTree.cc" + "${ORC_SOURCE_SRC_DIR}/sargs/Literal.cc" + "${ORC_SOURCE_SRC_DIR}/sargs/PredicateLeaf.cc" + "${ORC_SOURCE_SRC_DIR}/sargs/SargsApplier.cc" + "${ORC_SOURCE_SRC_DIR}/sargs/SearchArgument.cc" + "${ORC_SOURCE_SRC_DIR}/sargs/TruthValue.cc" "${ORC_SOURCE_SRC_DIR}/Exceptions.cc" "${ORC_SOURCE_SRC_DIR}/OrcFile.cc" "${ORC_SOURCE_SRC_DIR}/Reader.cc" @@ -129,13 +136,20 @@ set(ORC_SRCS "${ORC_SOURCE_SRC_DIR}/MemoryPool.cc" "${ORC_SOURCE_SRC_DIR}/RLE.cc" "${ORC_SOURCE_SRC_DIR}/RLEv1.cc" - "${ORC_SOURCE_SRC_DIR}/RLEv2.cc" + "${ORC_SOURCE_SRC_DIR}/RleDecoderV2.cc" + "${ORC_SOURCE_SRC_DIR}/RleEncoderV2.cc" + "${ORC_SOURCE_SRC_DIR}/RLEV2Util.cc" "${ORC_SOURCE_SRC_DIR}/Statistics.cc" "${ORC_SOURCE_SRC_DIR}/StripeStream.cc" "${ORC_SOURCE_SRC_DIR}/Timezone.cc" "${ORC_SOURCE_SRC_DIR}/TypeImpl.cc" "${ORC_SOURCE_SRC_DIR}/Vector.cc" "${ORC_SOURCE_SRC_DIR}/Writer.cc" + "${ORC_SOURCE_SRC_DIR}/Adaptor.cc" + "${ORC_SOURCE_SRC_DIR}/BloomFilter.cc" + "${ORC_SOURCE_SRC_DIR}/Murmur3.cc" + "${ORC_SOURCE_SRC_DIR}/BlockBuffer.cc" + "${ORC_SOURCE_SRC_DIR}/wrap/orc-proto-wrapper.cc" "${ORC_SOURCE_SRC_DIR}/io/InputStream.cc" "${ORC_SOURCE_SRC_DIR}/io/OutputStream.cc" "${ORC_ADDITION_SOURCE_DIR}/orc_proto.pb.cc" @@ -358,6 +372,9 @@ SET(ARROW_SRCS "${LIBRARY_DIR}/util/compression_zlib.cc" ${ARROW_SRCS}) add_definitions(-DARROW_WITH_ZSTD) SET(ARROW_SRCS "${LIBRARY_DIR}/util/compression_zstd.cc" ${ARROW_SRCS}) +add_definitions(-DARROW_WITH_BROTLI) +SET(ARROW_SRCS "${LIBRARY_DIR}/util/compression_brotli.cc" ${ARROW_SRCS}) + add_library(_arrow ${ARROW_SRCS}) @@ -372,6 +389,7 @@ target_link_libraries(_arrow PRIVATE ch_contrib::snappy ch_contrib::zlib ch_contrib::zstd + ch_contrib::brotli ) target_link_libraries(_arrow PUBLIC _orc) diff --git a/contrib/orc b/contrib/orc index f9a393ed243..c5d7755ba0b 160000 --- a/contrib/orc +++ b/contrib/orc @@ -1 +1 @@ -Subproject commit f9a393ed2433a60034795284f82d093b348f2102 +Subproject commit c5d7755ba0b9a95631c8daea4d094101f26ec761 diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 3908254b6f1..8d3e787f42c 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -857,6 +857,7 @@ class IColumn; M(Bool, output_format_parquet_string_as_string, false, "Use Parquet String type instead of Binary for String columns.", 0) \ M(Bool, output_format_parquet_fixed_string_as_fixed_byte_array, true, "Use Parquet FIXED_LENGTH_BYTE_ARRAY type instead of Binary for FixedString columns.", 0) \ M(ParquetVersion, output_format_parquet_version, "2.latest", "Parquet format version for output format. Supported versions: 1.0, 2.4, 2.6 and 2.latest (default)", 0) \ + M(ParquetCompression, output_format_parquet_compression_method, "snappy", "Compression method for Parquet output format. Supported codecs: snappy, lz4, brotli, zstd, gzip, none (uncompressed)", 0) \ M(String, output_format_avro_codec, "", "Compression codec used for output. Possible values: 'null', 'deflate', 'snappy'.", 0) \ M(UInt64, output_format_avro_sync_interval, 16 * 1024, "Sync interval in bytes.", 0) \ M(String, output_format_avro_string_column_pattern, "", "For Avro format: regexp of String columns to select as AVRO string.", 0) \ @@ -899,8 +900,10 @@ class IColumn; M(Bool, output_format_arrow_low_cardinality_as_dictionary, false, "Enable output LowCardinality type as Dictionary Arrow type", 0) \ M(Bool, output_format_arrow_string_as_string, false, "Use Arrow String type instead of Binary for String columns", 0) \ M(Bool, output_format_arrow_fixed_string_as_fixed_byte_array, true, "Use Arrow FIXED_SIZE_BINARY type instead of Binary for FixedString columns.", 0) \ + M(ArrowCompression, output_format_arrow_compression_method, "none", "Compression method for Arrow output format. Supported codecs: lz4_frame, zstd, none (uncompressed)", 0) \ \ M(Bool, output_format_orc_string_as_string, false, "Use ORC String type instead of Binary for String columns", 0) \ + M(ORCCompression, output_format_orc_compression_method, "none", "Compression method for ORC output format. Supported codecs: lz4, snappy, zlib, zstd, none (uncompressed)", 0) \ \ M(EnumComparingMode, format_capn_proto_enum_comparising_mode, FormatSettings::EnumComparingMode::BY_VALUES, "How to map ClickHouse Enum and CapnProto Enum", 0) \ \ diff --git a/src/Core/SettingsEnums.cpp b/src/Core/SettingsEnums.cpp index 9e1ab585bb0..91572aa1b3f 100644 --- a/src/Core/SettingsEnums.cpp +++ b/src/Core/SettingsEnums.cpp @@ -158,7 +158,7 @@ IMPLEMENT_SETTING_ENUM(EscapingRule, ErrorCodes::BAD_ARGUMENTS, {"XML", FormatSettings::EscapingRule::XML}, {"Raw", FormatSettings::EscapingRule::Raw}}) -IMPLEMENT_SETTING_ENUM(MsgPackUUIDRepresentation , ErrorCodes::BAD_ARGUMENTS, +IMPLEMENT_SETTING_ENUM(MsgPackUUIDRepresentation, ErrorCodes::BAD_ARGUMENTS, {{"bin", FormatSettings::MsgPackUUIDRepresentation::BIN}, {"str", FormatSettings::MsgPackUUIDRepresentation::STR}, {"ext", FormatSettings::MsgPackUUIDRepresentation::EXT}}) @@ -172,11 +172,30 @@ IMPLEMENT_SETTING_ENUM(LocalFSReadMethod, ErrorCodes::BAD_ARGUMENTS, {"pread", LocalFSReadMethod::pread}, {"read", LocalFSReadMethod::read}}) - IMPLEMENT_SETTING_ENUM_WITH_RENAME(ParquetVersion, ErrorCodes::BAD_ARGUMENTS, {{"1.0", FormatSettings::ParquetVersion::V1_0}, {"2.4", FormatSettings::ParquetVersion::V2_4}, {"2.6", FormatSettings::ParquetVersion::V2_6}, {"2.latest", FormatSettings::ParquetVersion::V2_LATEST}}) +IMPLEMENT_SETTING_ENUM(ParquetCompression, ErrorCodes::BAD_ARGUMENTS, + {{"none", FormatSettings::ParquetCompression::NONE}, + {"snappy", FormatSettings::ParquetCompression::SNAPPY}, + {"zstd", FormatSettings::ParquetCompression::ZSTD}, + {"gzip", FormatSettings::ParquetCompression::GZIP}, + {"lz4", FormatSettings::ParquetCompression::LZ4}, + {"brotli", FormatSettings::ParquetCompression::BROTLI}}) + +IMPLEMENT_SETTING_ENUM(ArrowCompression, ErrorCodes::BAD_ARGUMENTS, + {{"none", FormatSettings::ArrowCompression::NONE}, + {"lz4_frame", FormatSettings::ArrowCompression::LZ4_FRAME}, + {"zstd", FormatSettings::ArrowCompression::ZSTD}}) + +IMPLEMENT_SETTING_ENUM(ORCCompression, ErrorCodes::BAD_ARGUMENTS, + {{"none", FormatSettings::ORCCompression::NONE}, + {"snappy", FormatSettings::ORCCompression::SNAPPY}, + {"zstd", FormatSettings::ORCCompression::ZSTD}, + {"zlib", FormatSettings::ORCCompression::ZLIB}, + {"lz4", FormatSettings::ORCCompression::LZ4}}) + } diff --git a/src/Core/SettingsEnums.h b/src/Core/SettingsEnums.h index 139a04f3a5a..14e952bbd65 100644 --- a/src/Core/SettingsEnums.h +++ b/src/Core/SettingsEnums.h @@ -194,6 +194,12 @@ DECLARE_SETTING_ENUM_WITH_RENAME(EscapingRule, FormatSettings::EscapingRule) DECLARE_SETTING_ENUM_WITH_RENAME(MsgPackUUIDRepresentation, FormatSettings::MsgPackUUIDRepresentation) +DECLARE_SETTING_ENUM_WITH_RENAME(ParquetCompression, FormatSettings::ParquetCompression) + +DECLARE_SETTING_ENUM_WITH_RENAME(ArrowCompression, FormatSettings::ArrowCompression) + +DECLARE_SETTING_ENUM_WITH_RENAME(ORCCompression, FormatSettings::ORCCompression) + enum class Dialect { clickhouse, diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index aca3166a8c4..7f14810b260 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -118,6 +118,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.parquet.output_string_as_string = settings.output_format_parquet_string_as_string; format_settings.parquet.output_fixed_string_as_fixed_byte_array = settings.output_format_parquet_fixed_string_as_fixed_byte_array; format_settings.parquet.max_block_size = settings.input_format_parquet_max_block_size; + format_settings.parquet.output_compression_method = settings.output_format_parquet_compression_method; format_settings.pretty.charset = settings.output_format_pretty_grid_charset.toString() == "ASCII" ? FormatSettings::Pretty::Charset::ASCII : FormatSettings::Pretty::Charset::UTF8; format_settings.pretty.color = settings.output_format_pretty_color; format_settings.pretty.max_column_pad_width = settings.output_format_pretty_max_column_pad_width; @@ -158,6 +159,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.arrow.case_insensitive_column_matching = settings.input_format_arrow_case_insensitive_column_matching; format_settings.arrow.output_string_as_string = settings.output_format_arrow_string_as_string; format_settings.arrow.output_fixed_string_as_fixed_byte_array = settings.output_format_arrow_fixed_string_as_fixed_byte_array; + format_settings.arrow.output_compression_method = settings.output_format_arrow_compression_method; format_settings.orc.import_nested = settings.input_format_orc_import_nested; format_settings.orc.allow_missing_columns = settings.input_format_orc_allow_missing_columns; format_settings.orc.row_batch_size = settings.input_format_orc_row_batch_size; @@ -168,6 +170,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.orc.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_orc_skip_columns_with_unsupported_types_in_schema_inference; format_settings.orc.case_insensitive_column_matching = settings.input_format_orc_case_insensitive_column_matching; format_settings.orc.output_string_as_string = settings.output_format_orc_string_as_string; + format_settings.orc.output_compression_method = settings.output_format_orc_compression_method; format_settings.defaults_for_omitted_fields = settings.input_format_defaults_for_omitted_fields; format_settings.capn_proto.enum_comparing_mode = settings.format_capn_proto_enum_comparising_mode; format_settings.capn_proto.skip_fields_with_unsupported_types_in_schema_inference = settings.input_format_capn_proto_skip_fields_with_unsupported_types_in_schema_inference; diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index d1755a35c5f..88a5adbc8df 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -86,6 +86,13 @@ struct FormatSettings UInt64 max_parser_depth = DBMS_DEFAULT_MAX_PARSER_DEPTH; + enum class ArrowCompression + { + NONE, + LZ4_FRAME, + ZSTD + }; + struct { UInt64 row_group_size = 1000000; @@ -96,6 +103,7 @@ struct FormatSettings bool case_insensitive_column_matching = false; bool output_string_as_string = false; bool output_fixed_string_as_fixed_byte_array = true; + ArrowCompression output_compression_method = ArrowCompression::NONE; } arrow; struct @@ -183,6 +191,16 @@ struct FormatSettings V2_LATEST, }; + enum class ParquetCompression + { + NONE, + SNAPPY, + ZSTD, + LZ4, + GZIP, + BROTLI, + }; + struct { UInt64 row_group_size = 1000000; @@ -195,6 +213,7 @@ struct FormatSettings bool output_fixed_string_as_fixed_byte_array = true; UInt64 max_block_size = 8192; ParquetVersion output_version; + ParquetCompression output_compression_method = ParquetCompression::SNAPPY; } parquet; struct Pretty @@ -276,6 +295,15 @@ struct FormatSettings bool accurate_types_of_literals = true; } values; + enum class ORCCompression + { + NONE, + LZ4, + SNAPPY, + ZSTD, + ZLIB, + }; + struct { bool import_nested = false; @@ -285,6 +313,7 @@ struct FormatSettings bool case_insensitive_column_matching = false; std::unordered_set skip_stripes = {}; bool output_string_as_string = false; + ORCCompression output_compression_method = ORCCompression::NONE; } orc; /// For capnProto format we should determine how to diff --git a/src/Processors/Formats/Impl/ArrowBlockOutputFormat.cpp b/src/Processors/Formats/Impl/ArrowBlockOutputFormat.cpp index bf0e2448082..ec35c52e37c 100644 --- a/src/Processors/Formats/Impl/ArrowBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/ArrowBlockOutputFormat.cpp @@ -8,6 +8,7 @@ #include #include "ArrowBufferedStreams.h" #include "CHColumnToArrowColumn.h" +#include "config.h" namespace DB @@ -17,6 +18,25 @@ namespace ErrorCodes extern const int UNKNOWN_EXCEPTION; } +namespace +{ + +arrow::Compression::type getArrowCompression(FormatSettings::ArrowCompression method) +{ + switch (method) + { + case FormatSettings::ArrowCompression::NONE: + return arrow::Compression::type::UNCOMPRESSED; + case FormatSettings::ArrowCompression::ZSTD: + return arrow::Compression::type::ZSTD; + case FormatSettings::ArrowCompression::LZ4_FRAME: + return arrow::Compression::type::LZ4_FRAME; + } +} + +} + + ArrowBlockOutputFormat::ArrowBlockOutputFormat(WriteBuffer & out_, const Block & header_, bool stream_, const FormatSettings & format_settings_) : IOutputFormat(header_, out_) , stream{stream_} @@ -78,12 +98,14 @@ void ArrowBlockOutputFormat::prepareWriter(const std::shared_ptr { arrow_ostream = std::make_shared(out); arrow::Result> writer_status; + arrow::ipc::IpcWriteOptions options = arrow::ipc::IpcWriteOptions::Defaults(); + options.codec = *arrow::util::Codec::Create(getArrowCompression(format_settings.arrow.output_compression_method)); // TODO: should we use arrow::ipc::IpcOptions::alignment? if (stream) - writer_status = arrow::ipc::MakeStreamWriter(arrow_ostream.get(), schema); + writer_status = arrow::ipc::MakeStreamWriter(arrow_ostream.get(), schema, options); else - writer_status = arrow::ipc::MakeFileWriter(arrow_ostream.get(), schema); + writer_status = arrow::ipc::MakeFileWriter(arrow_ostream.get(), schema,options); if (!writer_status.ok()) throw Exception(ErrorCodes::UNKNOWN_EXCEPTION, diff --git a/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp b/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp index 42c3e178436..ecb7c2fbc92 100644 --- a/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp @@ -22,12 +22,42 @@ #include #include +#include "config.h" + namespace DB { namespace ErrorCodes { extern const int ILLEGAL_COLUMN; + extern const int NOT_IMPLEMENTED; +} + +namespace +{ + +orc::CompressionKind getORCCompression(FormatSettings::ORCCompression method) +{ + if (method == FormatSettings::ORCCompression::NONE) + return orc::CompressionKind::CompressionKind_NONE; + +#if USE_SNAPPY + if (method == FormatSettings::ORCCompression::SNAPPY) + return orc::CompressionKind::CompressionKind_SNAPPY; +#endif + + if (method == FormatSettings::ORCCompression::ZSTD) + return orc::CompressionKind::CompressionKind_ZSTD; + + if (method == FormatSettings::ORCCompression::LZ4) + return orc::CompressionKind::CompressionKind_LZ4; + + if (method == FormatSettings::ORCCompression::ZLIB) + return orc::CompressionKind::CompressionKind_ZLIB; + + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unsupported compression method"); +} + } ORCOutputStream::ORCOutputStream(WriteBuffer & out_) : out(out_) {} @@ -529,7 +559,7 @@ void ORCBlockOutputFormat::prepareWriter() { const Block & header = getPort(PortKind::Main).getHeader(); schema = orc::createStructType(); - options.setCompression(orc::CompressionKind::CompressionKind_NONE); + options.setCompression(getORCCompression(format_settings.orc.output_compression_method)); size_t columns_count = header.columns(); for (size_t i = 0; i != columns_count; ++i) schema->addStructField(header.safeGetByPosition(i).name, getORCType(recursiveRemoveLowCardinality(data_types[i]))); diff --git a/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp index 18c81f8fd6a..742912df980 100644 --- a/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp @@ -16,6 +16,9 @@ namespace ErrorCodes extern const int UNKNOWN_EXCEPTION; } +namespace +{ + static parquet::ParquetVersion::type getParquetVersion(const FormatSettings & settings) { switch (settings.parquet.output_version) @@ -31,6 +34,36 @@ static parquet::ParquetVersion::type getParquetVersion(const FormatSettings & se } } +parquet::Compression::type getParquetCompression(FormatSettings::ParquetCompression method) +{ + if (method == FormatSettings::ParquetCompression::NONE) + return parquet::Compression::type::UNCOMPRESSED; + +#if USE_SNAPPY + if (method == FormatSettings::ParquetCompression::SNAPPY) + return parquet::Compression::type::SNAPPY; +#endif + +#if USE_BROTLI + if (method == FormatSettings::ParquetCompression::BROTLI) + return parquet::Compression::type::BROTLI; +#endif + + if (method == FormatSettings::ParquetCompression::ZSTD) + return parquet::Compression::type::ZSTD; + + if (method == FormatSettings::ParquetCompression::LZ4) + return parquet::Compression::type::LZ4; + + if (method == FormatSettings::ParquetCompression::GZIP) + return parquet::Compression::type::GZIP; + + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unsupported compression method"); +} + + +} + ParquetBlockOutputFormat::ParquetBlockOutputFormat(WriteBuffer & out_, const Block & header_, const FormatSettings & format_settings_) : IOutputFormat(header_, out_), format_settings{format_settings_} { @@ -60,9 +93,7 @@ void ParquetBlockOutputFormat::consume(Chunk chunk) parquet::WriterProperties::Builder builder; builder.version(getParquetVersion(format_settings)); -#if USE_SNAPPY - builder.compression(parquet::Compression::SNAPPY); -#endif + builder.compression(getParquetCompression(format_settings.parquet.output_compression_method)); auto props = builder.build(); auto status = parquet::arrow::FileWriter::Open( *arrow_table->schema(), From 4f33e95c96af0b4cf9e1ad9ab3b92d1cfebf72e4 Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 1 Mar 2023 21:28:37 +0000 Subject: [PATCH 2/8] Add tests --- ...1_parquet_arrow_orc_compressions.reference | 14 +++++++++++ .../02581_parquet_arrow_orc_compressions.sh | 24 +++++++++++++++++++ 2 files changed, 38 insertions(+) create mode 100644 tests/queries/0_stateless/02581_parquet_arrow_orc_compressions.reference create mode 100755 tests/queries/0_stateless/02581_parquet_arrow_orc_compressions.sh diff --git a/tests/queries/0_stateless/02581_parquet_arrow_orc_compressions.reference b/tests/queries/0_stateless/02581_parquet_arrow_orc_compressions.reference new file mode 100644 index 00000000000..492b12dba56 --- /dev/null +++ b/tests/queries/0_stateless/02581_parquet_arrow_orc_compressions.reference @@ -0,0 +1,14 @@ +10 +10 +10 +10 +10 +10 +10 +10 +10 +10 +10 +10 +10 +10 diff --git a/tests/queries/0_stateless/02581_parquet_arrow_orc_compressions.sh b/tests/queries/0_stateless/02581_parquet_arrow_orc_compressions.sh new file mode 100755 index 00000000000..10850e17bea --- /dev/null +++ b/tests/queries/0_stateless/02581_parquet_arrow_orc_compressions.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +$CLICKHOUSE_LOCAL -q "select * from numbers(10) format Parquet settings output_format_parquet_compression_method='none'" | $CLICKHOUSE_LOCAL --input-format=Parquet -q "select count() from table" +$CLICKHOUSE_LOCAL -q "select * from numbers(10) format Parquet settings output_format_parquet_compression_method='lz4'" | $CLICKHOUSE_LOCAL --input-format=Parquet -q "select count() from table" +$CLICKHOUSE_LOCAL -q "select * from numbers(10) format Parquet settings output_format_parquet_compression_method='snappy'" | $CLICKHOUSE_LOCAL --input-format=Parquet -q "select count() from table" +$CLICKHOUSE_LOCAL -q "select * from numbers(10) format Parquet settings output_format_parquet_compression_method='zstd'" | $CLICKHOUSE_LOCAL --input-format=Parquet -q "select count() from table" +$CLICKHOUSE_LOCAL -q "select * from numbers(10) format Parquet settings output_format_parquet_compression_method='brotli'" | $CLICKHOUSE_LOCAL --input-format=Parquet -q "select count() from table" +$CLICKHOUSE_LOCAL -q "select * from numbers(10) format Parquet settings output_format_parquet_compression_method='gzip'" | $CLICKHOUSE_LOCAL --input-format=Parquet -q "select count() from table" + +$CLICKHOUSE_LOCAL -q "select * from numbers(10) format ORC settings output_format_orc_compression_method='none'" | $CLICKHOUSE_LOCAL --input-format=ORC -q "select count() from table" +$CLICKHOUSE_LOCAL -q "select * from numbers(10) format ORC settings output_format_orc_compression_method='lz4'" | $CLICKHOUSE_LOCAL --input-format=ORC -q "select count() from table" +$CLICKHOUSE_LOCAL -q "select * from numbers(10) format ORC settings output_format_orc_compression_method='zstd'" | $CLICKHOUSE_LOCAL --input-format=ORC -q "select count() from table" +$CLICKHOUSE_LOCAL -q "select * from numbers(10) format ORC settings output_format_orc_compression_method='zlib'" | $CLICKHOUSE_LOCAL --input-format=ORC -q "select count() from table" +$CLICKHOUSE_LOCAL -q "select * from numbers(10) format ORC settings output_format_orc_compression_method='snappy'" | $CLICKHOUSE_LOCAL --input-format=ORC -q "select count() from table" + + +$CLICKHOUSE_LOCAL -q "select * from numbers(10) format Arrow settings output_format_arrow_compression_method='none'" | $CLICKHOUSE_LOCAL --input-format=Arrow -q "select count() from table" +$CLICKHOUSE_LOCAL -q "select * from numbers(10) format Arrow settings output_format_arrow_compression_method='lz4_frame'" | $CLICKHOUSE_LOCAL --input-format=Arrow -q "select count() from table" +$CLICKHOUSE_LOCAL -q "select * from numbers(10) format Arrow settings output_format_arrow_compression_method='zstd'" | $CLICKHOUSE_LOCAL --input-format=Arrow -q "select count() from table" + From a6cf2cdab8e59e8457def1a891fb3a3443918ca3 Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 2 Mar 2023 10:36:07 +0000 Subject: [PATCH 3/8] Fix style, add docs --- docs/en/interfaces/formats.md | 3 +++ .../en/operations/settings/settings-formats.md | 18 ++++++++++++++++++ .../Formats/Impl/ArrowBlockOutputFormat.cpp | 2 -- .../Formats/Impl/ORCBlockOutputFormat.cpp | 2 -- .../Formats/Impl/ParquetBlockOutputFormat.cpp | 2 +- .../02581_parquet_arrow_orc_compressions.sh | 1 + 6 files changed, 23 insertions(+), 5 deletions(-) diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index b2b2c6d5b1e..1b32de9723f 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -1973,6 +1973,7 @@ To exchange data with Hadoop, you can use [HDFS table engine](/docs/en/engines/t - [input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference](/docs/en/operations/settings/settings-formats.md/#input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference) - allow skipping columns with unsupported types while schema inference for Parquet format. Default value - `false`. - [output_format_parquet_fixed_string_as_fixed_byte_array](/docs/en/operations/settings/settings-formats.md/#output_format_parquet_fixed_string_as_fixed_byte_array) - use Parquet FIXED_LENGTH_BYTE_ARRAY type instead of Binary/String for FixedString columns. Default value - `true`. - [output_format_parquet_version](/docs/en/operations/settings/settings-formats.md/#output_format_parquet_version) - The version of Parquet format used in output format. Default value - `2.latest`. +- [output_format_parquet_compression_method](/docs/en/operations/settings/settings-formats.md/#output_format_parquet_compression_method) - compression method used in output Parquet format. Default value - `snappy`. ## Arrow {#data-format-arrow} @@ -2041,6 +2042,7 @@ $ clickhouse-client --query="SELECT * FROM {some_table} FORMAT Arrow" > {filenam - [input_format_arrow_allow_missing_columns](/docs/en/operations/settings/settings-formats.md/#input_format_arrow_allow_missing_columns) - allow missing columns while reading Arrow data. Default value - `false`. - [input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference](/docs/en/operations/settings/settings-formats.md/#input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference) - allow skipping columns with unsupported types while schema inference for Arrow format. Default value - `false`. - [output_format_arrow_fixed_string_as_fixed_byte_array](/docs/en/operations/settings/settings-formats.md/#output_format_arrow_fixed_string_as_fixed_byte_array) - use Arrow FIXED_SIZE_BINARY type instead of Binary/String for FixedString columns. Default value - `true`. +- [output_format_arrow_compression_method](/docs/en/operations/settings/settings-formats.md/#output_format_arrow_compression_method) - compression method used in output Arrow format. Default value - `none`. ## ArrowStream {#data-format-arrow-stream} @@ -2096,6 +2098,7 @@ $ clickhouse-client --query="SELECT * FROM {some_table} FORMAT ORC" > {filename. ### Arrow format settings {#parquet-format-settings} - [output_format_arrow_string_as_string](/docs/en/operations/settings/settings-formats.md/#output_format_arrow_string_as_string) - use Arrow String type instead of Binary for String columns. Default value - `false`. +- [output_format_orc_compression_method](/docs/en/operations/settings/settings-formats.md/#output_format_orc_compression_method) - compression method used in output ORC format. Default value - `none`. - [input_format_arrow_import_nested](/docs/en/operations/settings/settings-formats.md/#input_format_arrow_import_nested) - allow inserting array of structs into Nested table in Arrow input format. Default value - `false`. - [input_format_arrow_case_insensitive_column_matching](/docs/en/operations/settings/settings-formats.md/#input_format_arrow_case_insensitive_column_matching) - ignore case when matching Arrow columns with ClickHouse columns. Default value - `false`. - [input_format_arrow_allow_missing_columns](/docs/en/operations/settings/settings-formats.md/#input_format_arrow_allow_missing_columns) - allow missing columns while reading Arrow data. Default value - `false`. diff --git a/docs/en/operations/settings/settings-formats.md b/docs/en/operations/settings/settings-formats.md index 3580d83f704..919ebaf562f 100644 --- a/docs/en/operations/settings/settings-formats.md +++ b/docs/en/operations/settings/settings-formats.md @@ -1014,6 +1014,12 @@ Use Arrow FIXED_SIZE_BINARY type instead of Binary/String for FixedString column Enabled by default. +### output_format_arrow_compression_method {#output_format_arrow_compression_method} + +Compression method used in output Arrow format. Supported codecs: `lz4_frame`, `zstd`, `none` (uncompressed) + +Default value: `none`. + ## ORC format settings {#orc-format-settings} ### input_format_orc_import_nested {#input_format_orc_import_nested} @@ -1057,6 +1063,12 @@ Use ORC String type instead of Binary for String columns. Disabled by default. +### output_format_orc_compression_method {#output_format_orc_compression_method} + +Compression method used in output ORC format. Supported codecs: `lz4`, `snappy`, `zlib`, `zstd`, `none` (uncompressed) + +Default value: `none`. + ## Parquet format settings {#parquet-format-settings} ### input_format_parquet_import_nested {#input_format_parquet_import_nested} @@ -1112,6 +1124,12 @@ The version of Parquet format used in output format. Supported versions: `1.0`, Default value: `2.latest`. +### output_format_parquet_compression_method {#output_format_parquet_compression_method} + +Compression method used in output Parquet format. Supported codecs: `snappy`, `lz4`, `brotli`, `zstd`, `gzip`, `none` (uncompressed) + +Default value: `snappy`. + ## Hive format settings {#hive-format-settings} ### input_format_hive_text_fields_delimiter {#input_format_hive_text_fields_delimiter} diff --git a/src/Processors/Formats/Impl/ArrowBlockOutputFormat.cpp b/src/Processors/Formats/Impl/ArrowBlockOutputFormat.cpp index ec35c52e37c..c85c0342c8c 100644 --- a/src/Processors/Formats/Impl/ArrowBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/ArrowBlockOutputFormat.cpp @@ -8,7 +8,6 @@ #include #include "ArrowBufferedStreams.h" #include "CHColumnToArrowColumn.h" -#include "config.h" namespace DB @@ -36,7 +35,6 @@ arrow::Compression::type getArrowCompression(FormatSettings::ArrowCompression me } - ArrowBlockOutputFormat::ArrowBlockOutputFormat(WriteBuffer & out_, const Block & header_, bool stream_, const FormatSettings & format_settings_) : IOutputFormat(header_, out_) , stream{stream_} diff --git a/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp b/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp index ecb7c2fbc92..39cacde94ed 100644 --- a/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp @@ -22,8 +22,6 @@ #include #include -#include "config.h" - namespace DB { diff --git a/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp index 742912df980..cedd8a9c54c 100644 --- a/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp @@ -14,6 +14,7 @@ namespace DB namespace ErrorCodes { extern const int UNKNOWN_EXCEPTION; + extern const int NOT_IMPLEMENTED; } namespace @@ -61,7 +62,6 @@ parquet::Compression::type getParquetCompression(FormatSettings::ParquetCompress throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unsupported compression method"); } - } ParquetBlockOutputFormat::ParquetBlockOutputFormat(WriteBuffer & out_, const Block & header_, const FormatSettings & format_settings_) diff --git a/tests/queries/0_stateless/02581_parquet_arrow_orc_compressions.sh b/tests/queries/0_stateless/02581_parquet_arrow_orc_compressions.sh index 10850e17bea..89b5147f026 100755 --- a/tests/queries/0_stateless/02581_parquet_arrow_orc_compressions.sh +++ b/tests/queries/0_stateless/02581_parquet_arrow_orc_compressions.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash +# Tags: no-fasttest CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh From d8e5fb519557041dc03c64026775329720da3d2a Mon Sep 17 00:00:00 2001 From: avogar Date: Mon, 6 Mar 2023 15:12:15 +0000 Subject: [PATCH 4/8] Use switch, use lz4 by default --- src/Core/Settings.h | 6 ++-- src/Core/SettingsChangesHistory.h | 5 ++- .../Formats/Impl/ORCBlockOutputFormat.cpp | 32 ++++++++--------- .../Formats/Impl/ParquetBlockOutputFormat.cpp | 36 +++++++++---------- 4 files changed, 39 insertions(+), 40 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 8eaecff9708..ae5d5326031 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -858,7 +858,7 @@ class IColumn; M(Bool, output_format_parquet_string_as_string, false, "Use Parquet String type instead of Binary for String columns.", 0) \ M(Bool, output_format_parquet_fixed_string_as_fixed_byte_array, true, "Use Parquet FIXED_LENGTH_BYTE_ARRAY type instead of Binary for FixedString columns.", 0) \ M(ParquetVersion, output_format_parquet_version, "2.latest", "Parquet format version for output format. Supported versions: 1.0, 2.4, 2.6 and 2.latest (default)", 0) \ - M(ParquetCompression, output_format_parquet_compression_method, "snappy", "Compression method for Parquet output format. Supported codecs: snappy, lz4, brotli, zstd, gzip, none (uncompressed)", 0) \ + M(ParquetCompression, output_format_parquet_compression_method, "lz4", "Compression method for Parquet output format. Supported codecs: snappy, lz4, brotli, zstd, gzip, none (uncompressed)", 0) \ M(String, output_format_avro_codec, "", "Compression codec used for output. Possible values: 'null', 'deflate', 'snappy'.", 0) \ M(UInt64, output_format_avro_sync_interval, 16 * 1024, "Sync interval in bytes.", 0) \ M(String, output_format_avro_string_column_pattern, "", "For Avro format: regexp of String columns to select as AVRO string.", 0) \ @@ -901,10 +901,10 @@ class IColumn; M(Bool, output_format_arrow_low_cardinality_as_dictionary, false, "Enable output LowCardinality type as Dictionary Arrow type", 0) \ M(Bool, output_format_arrow_string_as_string, false, "Use Arrow String type instead of Binary for String columns", 0) \ M(Bool, output_format_arrow_fixed_string_as_fixed_byte_array, true, "Use Arrow FIXED_SIZE_BINARY type instead of Binary for FixedString columns.", 0) \ - M(ArrowCompression, output_format_arrow_compression_method, "none", "Compression method for Arrow output format. Supported codecs: lz4_frame, zstd, none (uncompressed)", 0) \ + M(ArrowCompression, output_format_arrow_compression_method, "lz4_frame", "Compression method for Arrow output format. Supported codecs: lz4_frame, zstd, none (uncompressed)", 0) \ \ M(Bool, output_format_orc_string_as_string, false, "Use ORC String type instead of Binary for String columns", 0) \ - M(ORCCompression, output_format_orc_compression_method, "none", "Compression method for ORC output format. Supported codecs: lz4, snappy, zlib, zstd, none (uncompressed)", 0) \ + M(ORCCompression, output_format_orc_compression_method, "lz4", "Compression method for ORC output format. Supported codecs: lz4, snappy, zlib, zstd, none (uncompressed)", 0) \ \ M(EnumComparingMode, format_capn_proto_enum_comparising_mode, FormatSettings::EnumComparingMode::BY_VALUES, "How to map ClickHouse Enum and CapnProto Enum", 0) \ \ diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index 04f328bb665..91b3dff3141 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -81,7 +81,10 @@ namespace SettingsChangesHistory static std::map settings_changes_history = { {"23.3", {{"output_format_parquet_version", "1.0", "2.latest", "Use latest Parquet format version for output format"}, - {"input_format_json_ignore_unknown_keys_in_named_tuple", false, true, "Improve parsing JSON objects as named tuples"}}}, + {"input_format_json_ignore_unknown_keys_in_named_tuple", false, true, "Improve parsing JSON objects as named tuples"}, + {"output_format_arrow_compression_method", "none", "lz4_frame", "Use lz4 compression in Arrow output format by default"}, + {"output_format_parquet_compression_method", "snappy", "lz4", "Use lz4 compression in Parquet output format by default"}, + {"output_format_orc_compression_method", "none", "lz4_frame", "Use lz4 compression in ORC output format by default"}}}, {"23.2", {{"output_format_parquet_fixed_string_as_fixed_byte_array", false, true, "Use Parquet FIXED_LENGTH_BYTE_ARRAY type for FixedString by default"}, {"output_format_arrow_fixed_string_as_fixed_byte_array", false, true, "Use Arrow FIXED_SIZE_BINARY type for FixedString by default"}, {"query_plan_remove_redundant_distinct", false, true, "Remove redundant Distinct step in query plan"}, diff --git a/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp b/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp index 39cacde94ed..4264e9da4d6 100644 --- a/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp @@ -23,7 +23,7 @@ #include namespace DB -{ +{7 namespace ErrorCodes { @@ -36,24 +36,22 @@ namespace orc::CompressionKind getORCCompression(FormatSettings::ORCCompression method) { - if (method == FormatSettings::ORCCompression::NONE) - return orc::CompressionKind::CompressionKind_NONE; - + switch (method) + { + case FormatSettings::ORCCompression::NONE: + return orc::CompressionKind::CompressionKind_NONE; + case FormatSettings::ORCCompression::SNAPPY: #if USE_SNAPPY - if (method == FormatSettings::ORCCompression::SNAPPY) - return orc::CompressionKind::CompressionKind_SNAPPY; + return orc::CompressionKind::CompressionKind_SNAPPY; #endif - - if (method == FormatSettings::ORCCompression::ZSTD) - return orc::CompressionKind::CompressionKind_ZSTD; - - if (method == FormatSettings::ORCCompression::LZ4) - return orc::CompressionKind::CompressionKind_LZ4; - - if (method == FormatSettings::ORCCompression::ZLIB) - return orc::CompressionKind::CompressionKind_ZLIB; - - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unsupported compression method"); + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Snappy compression method is not supported"); + case FormatSettings::ORCCompression::ZSTD: + return orc::CompressionKind::CompressionKind_ZSTD; + case FormatSettings::ORCCompression::LZ4: + return orc::CompressionKind::CompressionKind_LZ4; + case FormatSettings::ORCCompression::ZLIB: + return orc::CompressionKind::CompressionKind_ZLIB; + } } } diff --git a/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp index cedd8a9c54c..3695bb9d110 100644 --- a/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp @@ -37,29 +37,27 @@ static parquet::ParquetVersion::type getParquetVersion(const FormatSettings & se parquet::Compression::type getParquetCompression(FormatSettings::ParquetCompression method) { - if (method == FormatSettings::ParquetCompression::NONE) - return parquet::Compression::type::UNCOMPRESSED; - + switch (method) + { + case FormatSettings::ParquetCompression::NONE: + return parquet::Compression::type::UNCOMPRESSED; + case FormatSettings::ParquetCompression::SNAPPY: #if USE_SNAPPY - if (method == FormatSettings::ParquetCompression::SNAPPY) - return parquet::Compression::type::SNAPPY; + return parquet::Compression::type::SNAPPY; #endif - + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Snappy compression method is not supported"); + case FormatSettings::ParquetCompression::BROTLI: #if USE_BROTLI - if (method == FormatSettings::ParquetCompression::BROTLI) - return parquet::Compression::type::BROTLI; + return parquet::Compression::type::BROTLI; #endif - - if (method == FormatSettings::ParquetCompression::ZSTD) - return parquet::Compression::type::ZSTD; - - if (method == FormatSettings::ParquetCompression::LZ4) - return parquet::Compression::type::LZ4; - - if (method == FormatSettings::ParquetCompression::GZIP) - return parquet::Compression::type::GZIP; - - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unsupported compression method"); + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Brotli compression method is not supported"); + case FormatSettings::ParquetCompression::ZSTD: + return parquet::Compression::type::ZSTD; + case FormatSettings::ParquetCompression::LZ4: + return parquet::Compression::type::LZ4; + case FormatSettings::ParquetCompression::GZIP: + return parquet::Compression::type::GZIP; + } } } From df34ab240d19d68226b230413d0ae023d98686f7 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Mon, 6 Mar 2023 18:03:17 +0100 Subject: [PATCH 5/8] Don't use switch --- .../Formats/Impl/ORCBlockOutputFormat.cpp | 30 ++++++++++--------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp b/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp index 4264e9da4d6..7fae2c8f480 100644 --- a/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp @@ -36,22 +36,24 @@ namespace orc::CompressionKind getORCCompression(FormatSettings::ORCCompression method) { - switch (method) - { - case FormatSettings::ORCCompression::NONE: - return orc::CompressionKind::CompressionKind_NONE; - case FormatSettings::ORCCompression::SNAPPY: + if (method == FormatSettings::ORCCompression::NONE) + return orc::CompressionKind::CompressionKind_NONE; + #if USE_SNAPPY - return orc::CompressionKind::CompressionKind_SNAPPY; + if (method == FormatSettings::ORCCompression::SNAPPY) + return orc::CompressionKind::CompressionKind_SNAPPY; #endif - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Snappy compression method is not supported"); - case FormatSettings::ORCCompression::ZSTD: - return orc::CompressionKind::CompressionKind_ZSTD; - case FormatSettings::ORCCompression::LZ4: - return orc::CompressionKind::CompressionKind_LZ4; - case FormatSettings::ORCCompression::ZLIB: - return orc::CompressionKind::CompressionKind_ZLIB; - } + + if (method == FormatSettings::ORCCompression::ZSTD) + return orc::CompressionKind::CompressionKind_ZSTD; + + if (method == FormatSettings::ORCCompression::LZ4) + return orc::CompressionKind::CompressionKind_LZ4; + + if (method == FormatSettings::ORCCompression::ZLIB) + return orc::CompressionKind::CompressionKind_ZLIB; + + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unsupported compression method"); } } From fed645a4945efab6aa9a2b98ac2861de483be158 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Mon, 6 Mar 2023 18:03:54 +0100 Subject: [PATCH 6/8] Don't use switch --- .../Formats/Impl/ParquetBlockOutputFormat.cpp | 36 ++++++++++--------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp index 3695bb9d110..cedd8a9c54c 100644 --- a/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp @@ -37,27 +37,29 @@ static parquet::ParquetVersion::type getParquetVersion(const FormatSettings & se parquet::Compression::type getParquetCompression(FormatSettings::ParquetCompression method) { - switch (method) - { - case FormatSettings::ParquetCompression::NONE: - return parquet::Compression::type::UNCOMPRESSED; - case FormatSettings::ParquetCompression::SNAPPY: + if (method == FormatSettings::ParquetCompression::NONE) + return parquet::Compression::type::UNCOMPRESSED; + #if USE_SNAPPY - return parquet::Compression::type::SNAPPY; + if (method == FormatSettings::ParquetCompression::SNAPPY) + return parquet::Compression::type::SNAPPY; #endif - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Snappy compression method is not supported"); - case FormatSettings::ParquetCompression::BROTLI: + #if USE_BROTLI - return parquet::Compression::type::BROTLI; + if (method == FormatSettings::ParquetCompression::BROTLI) + return parquet::Compression::type::BROTLI; #endif - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Brotli compression method is not supported"); - case FormatSettings::ParquetCompression::ZSTD: - return parquet::Compression::type::ZSTD; - case FormatSettings::ParquetCompression::LZ4: - return parquet::Compression::type::LZ4; - case FormatSettings::ParquetCompression::GZIP: - return parquet::Compression::type::GZIP; - } + + if (method == FormatSettings::ParquetCompression::ZSTD) + return parquet::Compression::type::ZSTD; + + if (method == FormatSettings::ParquetCompression::LZ4) + return parquet::Compression::type::LZ4; + + if (method == FormatSettings::ParquetCompression::GZIP) + return parquet::Compression::type::GZIP; + + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unsupported compression method"); } } From ba0cc954415275380c93f51c145b6ed479388aa3 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Tue, 7 Mar 2023 20:04:52 +0100 Subject: [PATCH 7/8] Fix typo --- src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp b/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp index 7fae2c8f480..39cacde94ed 100644 --- a/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp @@ -23,7 +23,7 @@ #include namespace DB -{7 +{ namespace ErrorCodes { From 79ef95bc0c2c2e983cdbb90afc75177aa8597430 Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 8 Mar 2023 12:54:22 +0000 Subject: [PATCH 8/8] Fix tests and builds --- .../Formats/Impl/ParquetBlockOutputFormat.cpp | 2 +- .../01308_orc_output_format_arrays.reference | Bin 567 -> 36 bytes .../01308_orc_output_format_arrays.sh | 2 +- .../queries/0_stateless/02426_orc_bug.reference | Bin 312 -> 36 bytes tests/queries/0_stateless/02426_orc_bug.sh | 9 +++++++++ tests/queries/0_stateless/02426_orc_bug.sql | 3 --- 6 files changed, 11 insertions(+), 5 deletions(-) create mode 100755 tests/queries/0_stateless/02426_orc_bug.sh delete mode 100644 tests/queries/0_stateless/02426_orc_bug.sql diff --git a/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp index cedd8a9c54c..759f773a574 100644 --- a/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp @@ -20,7 +20,7 @@ namespace ErrorCodes namespace { -static parquet::ParquetVersion::type getParquetVersion(const FormatSettings & settings) +parquet::ParquetVersion::type getParquetVersion(const FormatSettings & settings) { switch (settings.parquet.output_version) { diff --git a/tests/queries/0_stateless/01308_orc_output_format_arrays.reference b/tests/queries/0_stateless/01308_orc_output_format_arrays.reference index 1f9646ac112132378f512bb4e3a610f6019698e1..7feea7cec35600166ea51a6950459c05eb6c7d88 100644 GIT binary patch literal 36 rcmXpqH%qZFO-ZpdOEI=INij9EFfubvv`jTIH%c@%Pfk%#(B%REtfL6? literal 567 zcmaKou}Z{15Qb-Fv+J_CUfj1(^yJ73Txpxm?(hk$eF1A9K)8pp z5ER#OZ*wFHHuLYd|I9G_i{;#AZq?1nOH zi@h-PB)k2_Z@x78PeSsIycPNP4-!J}r2;h0Sd#;1L6P@^5+4Osk&~#JS^h!#DJTY| z99adXz?c*3z$*GG`bp%3P4IQ>*K8yV+53Mj)#Y{L&fPY*Z5^i6IX{^Uqp&J}r7{({ zSCX4ST<`_rgrW2m5pNorkz6v0~6|4#a#fA`ch4R^#kxNzslh121M bv&Wk&w9hE%uGh(gAgwNH^YEtBVmaq;55+tO diff --git a/tests/queries/0_stateless/01308_orc_output_format_arrays.sh b/tests/queries/0_stateless/01308_orc_output_format_arrays.sh index 1d9aea353b6..498854874cf 100755 --- a/tests/queries/0_stateless/01308_orc_output_format_arrays.sh +++ b/tests/queries/0_stateless/01308_orc_output_format_arrays.sh @@ -11,7 +11,7 @@ $CLICKHOUSE_CLIENT --query="CREATE TABLE orc (array1 Array(Int32), array2 Array( $CLICKHOUSE_CLIENT --query="INSERT INTO orc VALUES ([1,2,3,4,5], [[1,2], [3,4], [5]]), ([42], [[42, 42], [42]])"; -$CLICKHOUSE_CLIENT --query="SELECT * FROM orc FORMAT ORC"; +$CLICKHOUSE_CLIENT --query="SELECT * FROM orc FORMAT ORC SETTINGS output_format_orc_compression_method='none'" | md5sum; $CLICKHOUSE_CLIENT --query="DROP TABLE orc"; diff --git a/tests/queries/0_stateless/02426_orc_bug.reference b/tests/queries/0_stateless/02426_orc_bug.reference index e5ad2b492892318e8fa83954efdecbb2bf287a91..baa88da215881130acc288c7ce2a15bd34f45e90 100644 GIT binary patch literal 36 rcmcCawlFudut-WXGfYc0H%zs(G)*xwOi4*gGB-9%F*jFG(B%REwulK^ literal 312 zcmeYda^~XZVrF1q5MtqA3Si*ke3ufJp+V>i>cN2mUcI{%`ot z$i>FNCcq%U3#J(*_`x)jggB690E#n#X+|*3B)}-a4CVt30n&^h!+_QT)ghBo%