From 471dff6589abff5d05ab8a9bb267e198f377c536 Mon Sep 17 00:00:00 2001 From: copperybean Date: Sun, 25 Feb 2024 14:26:53 +0800 Subject: [PATCH] fix test Change-Id: Ia7dbf1d762f7f054a9aa677caaaff6bfe1a42c38 --- src/Core/SettingsChangesHistory.h | 1 + .../Formats/Impl/Parquet/ParquetDataBuffer.h | 13 +++++-------- .../Impl/Parquet/ParquetDataValuesReader.cpp | 2 +- .../Formats/Impl/Parquet/ParquetDataValuesReader.h | 4 ++-- .../Formats/Impl/Parquet/ParquetLeafColReader.cpp | 6 +++--- .../Formats/Impl/Parquet/ParquetRecordReader.cpp | 7 ++----- .../Formats/Impl/ParquetBlockInputFormat.cpp | 8 ++++++++ .../0_stateless/02998_native_parquet_reader.sh | 5 +++-- 8 files changed, 25 insertions(+), 21 deletions(-) diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index ece48620618..6fb8fb9358c 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -176,6 +176,7 @@ static std::map sett {"default_normal_view_sql_security", "INVOKER", "INVOKER", "Allows to set default `SQL SECURITY` option while creating a normal view"}, {"mysql_map_string_to_text_in_show_columns", false, true, "Reduce the configuration effort to connect ClickHouse with BI tools."}, {"mysql_map_fixed_string_to_text_in_show_columns", false, true, "Reduce the configuration effort to connect ClickHouse with BI tools."}, + {"input_format_parquet_use_native_reader", false, false, "When reading Parquet files, to use native reader instead of arrow reader."}, }}, {"24.1", {{"print_pretty_type_names", false, true, "Better user experience."}, {"input_format_json_read_bools_as_strings", false, true, "Allow to read bools as strings in JSON formats by default"}, diff --git a/src/Processors/Formats/Impl/Parquet/ParquetDataBuffer.h b/src/Processors/Formats/Impl/Parquet/ParquetDataBuffer.h index f21216d5b5d..5c37375fa0c 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetDataBuffer.h +++ b/src/Processors/Formats/Impl/Parquet/ParquetDataBuffer.h @@ -38,15 +38,13 @@ public: template void ALWAYS_INLINE readValue(TValue & dst) { - checkAvaible(sizeof(TValue)); - dst = *(reinterpret_cast(data)); - consume(sizeof(TValue)); + readBytes(&dst, sizeof(TValue)); } void ALWAYS_INLINE readBytes(void * dst, size_t bytes) { checkAvaible(bytes); - memcpy(dst, data, bytes); + std::copy(data, data + bytes, reinterpret_cast(dst)); consume(bytes); } @@ -68,13 +66,12 @@ public: 100000000 * spd, 1000000000 * spd}; - checkAvaible(sizeof(parquet::Int96)); - auto decoded = parquet::DecodeInt96Timestamp(*reinterpret_cast(data)); + parquet::Int96 tmp; + readValue(tmp); + auto decoded = parquet::DecodeInt96Timestamp(tmp); uint64_t scaled_nano = decoded.nanoseconds / pow10[datetime64_scale]; dst = static_cast(decoded.days_since_epoch * scaled_day[datetime64_scale] + scaled_nano); - - consume(sizeof(parquet::Int96)); } /** diff --git a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp index 4ebe3d6a636..6743086e9e6 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp +++ b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp @@ -306,7 +306,7 @@ void ParquetPlainValuesReader>::readBatch( }, /* repeated_visitor */ [&](size_t nest_cursor, UInt32 count) { - auto col_data_pos = column_data + nest_cursor; + auto * col_data_pos = column_data + nest_cursor; for (UInt32 i = 0; i < count; i++) { plain_data_buffer.readDateTime64(col_data_pos[i]); diff --git a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.h b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.h index 8bc381aa8d2..688de4f52eb 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.h +++ b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.h @@ -97,7 +97,7 @@ public: * @tparam ValueGetter A callback with signature: TValue(Int32 val) */ template - void setValues(TValue * column_data, UInt32 num_values, ValueGetter && val_getter); + void setValues(TValue * res_values, UInt32 num_values, ValueGetter && val_getter); /** * @brief Set the value by valid_index_steps generated in visitNullableBySteps. @@ -106,7 +106,7 @@ public: */ template void setValueBySteps( - TValue * column_data, + TValue * res_values, const std::vector & col_data_steps, ValueGetter && val_getter); diff --git a/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.cpp b/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.cpp index 17feea80b9f..52dfad7606a 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.cpp +++ b/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.cpp @@ -113,7 +113,7 @@ ColumnPtr readDictPage>( const parquet::ColumnDescriptor & /* col_des */, const DataTypePtr & data_type) { - auto & datetime_type = assert_cast(*data_type); + const auto & datetime_type = assert_cast(*data_type); auto dict_col = ColumnDecimal::create(page.num_values(), datetime_type.getScale()); auto * col_data = dict_col->getData().data(); ParquetDataBuffer buffer(page.data(), page.size(), datetime_type.getScale()); @@ -282,7 +282,7 @@ void ParquetLeafColReader::degradeDictionary() dictionary = nullptr; return; } - assert(dictionary && column->size()); + assert(dictionary && !column->empty()); null_map = std::make_unique(reading_rows_num); auto col_existing = std::move(column); @@ -372,7 +372,7 @@ void ParquetLeafColReader::readPage() dict_page.encoding() != parquet::Encoding::PLAIN_DICTIONARY && dict_page.encoding() != parquet::Encoding::PLAIN)) { - throw new Exception( + throw Exception( ErrorCodes::NOT_IMPLEMENTED, "Unsupported dictionary page encoding {}", dict_page.encoding()); } LOG_DEBUG(log, "{} values in dictionary page of column {}", dict_page.num_values(), col_descriptor.name()); diff --git a/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp b/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp index 69e694a340f..9cde433b983 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp +++ b/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp @@ -156,9 +156,6 @@ ParquetRecordReader::ParquetRecordReader( , row_groups_indices(std::move(row_groups_indices_)) , left_rows(getTotalRows(*file_reader->metadata())) { - // Only little endian system is supported currently - static_assert(std::endian::native == std::endian::little); - log = &Poco::Logger::get("ParquetRecordReader"); parquet_col_indice.reserve(header.columns()); @@ -230,9 +227,9 @@ void ParquetRecordReader::loadNextRowGroup() Int64 ParquetRecordReader::getTotalRows(const parquet::FileMetaData & meta_data) { Int64 res = 0; - for (size_t i = 0; i < row_groups_indices.size(); i++) + for (auto idx : row_groups_indices) { - res += meta_data.RowGroup(row_groups_indices[i])->num_rows(); + res += meta_data.RowGroup(idx)->num_rows(); } return res; } diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp index e35d53dc4f4..2e849f09fda 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp @@ -484,6 +484,14 @@ void ParquetBlockInputFormat::initializeRowGroupBatchReader(size_t row_group_bat if (format_settings.parquet.use_native_reader) { +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunreachable-code" + if constexpr (std::endian::native != std::endian::little) + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "parquet native reader only supports little endian system currently"); +#pragma clang diagnostic pop + row_group_batch.native_record_reader = std::make_shared( getPort().getHeader(), std::move(properties), diff --git a/tests/queries/0_stateless/02998_native_parquet_reader.sh b/tests/queries/0_stateless/02998_native_parquet_reader.sh index 5c129e6c5ce..4e5169c4bf0 100755 --- a/tests/queries/0_stateless/02998_native_parquet_reader.sh +++ b/tests/queries/0_stateless/02998_native_parquet_reader.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash +# Tags: no-fasttest CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh @@ -201,8 +202,8 @@ CH_SCHEMA="\ plain_encoding_str Nullable(String), \ mix_encoding_str Nullable(String), \ dict_encoding_str LowCardinality(Nullable(String)), \ - plain_encoding_dt64 Nullable(DateTime64(9)), \ - dict_encoding_dt64 Nullable(DateTime64(9)), \ + plain_encoding_dt64 Nullable(DateTime64(9, \\'UTC\\')), \ + dict_encoding_dt64 Nullable(DateTime64(9, \\'UTC\\')), \ plain_encoding_decimal128 Nullable(Decimal(38, 3))" QUERY="SELECT * from file('$PAR_PATH', 'Parquet', '$CH_SCHEMA')"