Merge pull request #66025 from KevinyhZou/support_orc_reader_timezone

Support ORC file read by writer time zone
This commit is contained in:
Kruglov Pavel 2024-07-09 08:00:20 +00:00 committed by GitHub
commit aaf5412c71
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 22 additions and 0 deletions

View File

@ -1062,6 +1062,7 @@ class IColumn;
M(Bool, input_format_orc_allow_missing_columns, true, "Allow missing columns while reading ORC input formats", 0) \
M(Bool, input_format_orc_use_fast_decoder, true, "Use a faster ORC decoder implementation.", 0) \
M(Bool, input_format_orc_filter_push_down, true, "When reading ORC files, skip whole stripes or row groups based on the WHERE/PREWHERE expressions, min/max statistics or bloom filter in the ORC metadata.", 0) \
M(Bool, input_format_orc_read_use_writer_time_zone, false, "Whether use the writer's time zone in ORC stripe for ORC row reader, the default ORC row reader's time zone is GMT.", 0) \
M(Bool, input_format_parquet_allow_missing_columns, true, "Allow missing columns while reading Parquet input formats", 0) \
M(UInt64, input_format_parquet_local_file_min_bytes_for_seek, 8192, "Min bytes required for local read (file) to do seek, instead of read with ignore in Parquet input format", 0) \
M(Bool, input_format_arrow_allow_missing_columns, true, "Allow missing columns while reading Arrow input formats", 0) \

View File

@ -61,6 +61,7 @@ static std::initializer_list<std::pair<ClickHouseVersion, SettingsChangesHistory
{"optimize_functions_to_subcolumns", false, true, "Enable optimization by default"},
{"input_format_json_ignore_key_case", false, false, "Ignore json key case while read json field from string."},
{"optimize_trivial_insert_select", true, false, "The optimization does not make sense in many cases."},
{"input_format_orc_read_use_writer_time_zone", false, false, "Whether use the writer's time zone in ORC stripe for ORC row reader, the default ORC row reader's time zone is GMT."},
{"lightweight_mutation_projection_mode", "throw", "throw", "When lightweight delete happens on a table with projection(s), the possible operations include throw the exception as projection exists, or drop all projection related to this table then do lightweight delete."},
{"database_replicated_allow_heavy_create", true, false, "Long-running DDL queries (CREATE AS SELECT and POPULATE) for Replicated database engine was forbidden"},
{"query_plan_merge_filters", false, false, "Allow to merge filters in the query plan"},

View File

@ -243,6 +243,7 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se
format_settings.orc.output_row_index_stride = settings.output_format_orc_row_index_stride;
format_settings.orc.use_fast_decoder = settings.input_format_orc_use_fast_decoder;
format_settings.orc.filter_push_down = settings.input_format_orc_filter_push_down;
format_settings.orc.read_use_writer_time_zone = settings.input_format_orc_read_use_writer_time_zone;
format_settings.defaults_for_omitted_fields = settings.input_format_defaults_for_omitted_fields;
format_settings.capn_proto.enum_comparing_mode = settings.format_capn_proto_enum_comparising_mode;
format_settings.capn_proto.skip_fields_with_unsupported_types_in_schema_inference = settings.input_format_capn_proto_skip_fields_with_unsupported_types_in_schema_inference;

View File

@ -403,6 +403,7 @@ struct FormatSettings
bool use_fast_decoder = true;
bool filter_push_down = true;
UInt64 output_row_index_stride = 10'000;
bool read_use_writer_time_zone = false;
} orc{};
/// For capnProto format we should determine how to

View File

@ -900,6 +900,11 @@ bool NativeORCBlockInputFormat::prepareStripeReader()
orc::RowReaderOptions row_reader_options;
row_reader_options.includeTypes(include_indices);
if (format_settings.orc.read_use_writer_time_zone)
{
String writer_time_zone = current_stripe_info->getWriterTimezone();
row_reader_options.setTimezoneName(writer_time_zone);
}
row_reader_options.range(current_stripe_info->getOffset(), current_stripe_info->getLength());
if (format_settings.orc.filter_push_down && sarg)
{

View File

@ -0,0 +1 @@
1 2024-06-30 20:00:00.000

View File

@ -0,0 +1,12 @@
#!/usr/bin/env bash
# Tags: no-fasttest
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh
$CLICKHOUSE_CLIENT -q "drop table if exists test"
$CLICKHOUSE_CLIENT -q "create table test(id UInt64, t DateTime64) Engine=MergeTree order by id"
$CLICKHOUSE_CLIENT -q "insert into test from infile '$CURDIR/data_orc/test_reader_time_zone.snappy.orc' SETTINGS input_format_orc_read_use_writer_time_zone=true FORMAT ORC"
$CLICKHOUSE_CLIENT -q "select * from test SETTINGS session_timezone='Asia/Shanghai'"
$CLICKHOUSE_CLIENT -q "drop table test"