From 03ab6252653ddcb70d1fef0a340bfb52a8d63f89 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Tue, 20 Aug 2024 15:47:26 +0800 Subject: [PATCH] enable string dict encoding in orc output format --- src/Core/Settings.h | 1 + src/Formats/FormatFactory.cpp | 1 + src/Formats/FormatSettings.h | 1 + 3 files changed, 3 insertions(+) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index d8837d26e54..3c507bc064f 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -1265,6 +1265,7 @@ class IColumn; M(Bool, output_format_orc_string_as_string, true, "Use ORC String type instead of Binary for String columns", 0) \ M(ORCCompression, output_format_orc_compression_method, "zstd", "Compression method for ORC output format. Supported codecs: lz4, snappy, zlib, zstd, none (uncompressed)", 0) \ M(UInt64, output_format_orc_row_index_stride, 10'000, "Target row index stride in ORC output format", 0) \ + M(Double, output_format_orc_dictionary_key_size_threshold, 0.0, "For a string column in ORC output format, if the number of distinct values is greater than this fraction of the total number of non-null rows, turn off dictionary encoding. Otherwise dictionary encoding is enabled", 0) \ \ M(CapnProtoEnumComparingMode, format_capn_proto_enum_comparising_mode, FormatSettings::CapnProtoEnumComparingMode::BY_VALUES, "How to map ClickHouse Enum and CapnProto Enum", 0) \ \ diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 865b6e6f3f1..479d7a3f029 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -243,6 +243,7 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se format_settings.orc.output_string_as_string = settings.output_format_orc_string_as_string; format_settings.orc.output_compression_method = settings.output_format_orc_compression_method; format_settings.orc.output_row_index_stride = settings.output_format_orc_row_index_stride; + format_settings.orc.output_dictionary_key_size_threshold = settings.output_format_orc_dictionary_key_size_threshold; format_settings.orc.use_fast_decoder = settings.input_format_orc_use_fast_decoder; format_settings.orc.filter_push_down = settings.input_format_orc_filter_push_down; format_settings.orc.reader_time_zone_name = settings.input_format_orc_reader_time_zone_name; diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index da25da74134..06535bc07a3 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -413,6 +413,7 @@ struct FormatSettings bool filter_push_down = true; UInt64 output_row_index_stride = 10'000; String reader_time_zone_name = "GMT"; + double output_dictionary_key_size_threshold = 0.0; } orc{}; /// For capnProto format we should determine how to