From 92a1f0c562a1e051fd194831a3578280851e29a9 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 20 Nov 2024 10:18:07 -0300 Subject: [PATCH 1/3] approach1 --- .../Formats/Impl/Parquet/ParquetDataBuffer.h | 16 ++ .../Impl/Parquet/ParquetDataValuesReader.cpp | 41 ++-- .../Impl/Parquet/ParquetDataValuesReader.h | 46 +++- .../Impl/Parquet/ParquetLeafColReader.cpp | 222 ++++++++++-------- .../Impl/Parquet/ParquetLeafColReader.h | 5 +- .../Impl/Parquet/ParquetRecordReader.cpp | 57 +++-- ...t_native_reader_int_logical_type.reference | 0 ..._parquet_native_reader_int_logical_type.sh | 21 ++ 8 files changed, 265 insertions(+), 143 deletions(-) create mode 100644 tests/queries/0_stateless/03262_test_parquet_native_reader_int_logical_type.reference create mode 100644 tests/queries/0_stateless/03262_test_parquet_native_reader_int_logical_type.sh diff --git a/src/Processors/Formats/Impl/Parquet/ParquetDataBuffer.h b/src/Processors/Formats/Impl/Parquet/ParquetDataBuffer.h index 57df6f59f72..8f0b39c917c 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetDataBuffer.h +++ b/src/Processors/Formats/Impl/Parquet/ParquetDataBuffer.h @@ -48,6 +48,22 @@ public: consume(bytes); } + template + void ALWAYS_INLINE readValuesOfDifferentSize(TValue * dst, size_t count) + { + auto necessary_bytes = count * sizeof(ParquetType); + checkAvaible(necessary_bytes); + + const ParquetType* src = reinterpret_cast(data); + + for (std::size_t i = 0; i < count; i++) + { + dst[i] = static_cast(src[i]); + } + + consume(necessary_bytes); + } + void ALWAYS_INLINE readDateTime64FromInt96(DateTime64 & dst) { static const int max_scale_num = 9; diff --git a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp index b471989076b..758627f833d 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp +++ b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp @@ -240,8 +240,8 @@ TValue * getResizedPrimitiveData(TColumn & column, size_t size) } // anoynomous namespace -template <> -void ParquetPlainValuesReader::readBatch( +template +void ParquetPlainByteArrayValuesReader::readBatch( MutableColumnPtr & col_ptr, LazyNullMap & null_map, UInt32 num_values) { auto & column = *assert_cast(col_ptr.get()); @@ -322,8 +322,8 @@ void ParquetBitPlainReader::readBatch( } -template <> -void ParquetPlainValuesReader, ParquetReaderTypes::TimestampInt96>::readBatch( +template +void ParquetPlainInt96ValuesReader::readBatch( MutableColumnPtr & col_ptr, LazyNullMap & null_map, UInt32 num_values) { auto cursor = col_ptr->size(); @@ -350,8 +350,8 @@ void ParquetPlainValuesReader, ParquetReaderTypes::Tim ); } -template -void ParquetPlainValuesReader::readBatch( +template +void ParquetPlainValuesReader::readBatch( MutableColumnPtr & col_ptr, LazyNullMap & null_map, UInt32 num_values) { auto cursor = col_ptr->size(); @@ -576,17 +576,19 @@ void ParquetRleDictReader::readBatch( } -template class ParquetPlainValuesReader; -template class ParquetPlainValuesReader; -template class ParquetPlainValuesReader; -template class ParquetPlainValuesReader; -template class ParquetPlainValuesReader; -template class ParquetPlainValuesReader; -template class ParquetPlainValuesReader>; -template class ParquetPlainValuesReader>; -template class ParquetPlainValuesReader>; -template class ParquetPlainValuesReader; -template class ParquetPlainValuesReader; +template class ParquetPlainValuesReader; +template class ParquetPlainValuesReader; +template class ParquetPlainValuesReader; +template class ParquetPlainValuesReader; +template class ParquetPlainValuesReader; +template class ParquetPlainValuesReader; +template class ParquetPlainValuesReader; +template class ParquetPlainValuesReader; +template class ParquetPlainValuesReader; +template class ParquetPlainValuesReader; +template class ParquetPlainValuesReader, int32_t>; +template class ParquetPlainValuesReader, int64_t>; +template class ParquetPlainValuesReader, int64_t>; template class ParquetBitPlainReader; @@ -597,7 +599,6 @@ template class ParquetRleLCReader; template class ParquetRleLCReader; template class ParquetRleLCReader; -template class ParquetRleDictReader; template class ParquetRleDictReader; template class ParquetRleDictReader; template class ParquetRleDictReader; @@ -611,4 +612,8 @@ template class ParquetRleDictReader>; template class ParquetRleDictReader>; template class ParquetRleDictReader; +template class ParquetPlainByteArrayValuesReader; + +template class ParquetPlainInt96ValuesReader>; + } diff --git a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.h b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.h index db55f7e2d6a..4ffc7c6629f 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.h +++ b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.h @@ -150,7 +150,7 @@ enum class ParquetReaderTypes /** * The definition level is RLE or BitPacked encoding, while data is read directly */ -template +template class ParquetPlainValuesReader : public ParquetDataValuesReader { public: @@ -172,6 +172,50 @@ private: ParquetDataBuffer plain_data_buffer; }; +template +class ParquetPlainInt96ValuesReader : public ParquetDataValuesReader +{ +public: + + ParquetPlainInt96ValuesReader( + Int32 max_def_level_, + std::unique_ptr def_level_reader_, + ParquetDataBuffer data_buffer_) + : max_def_level(max_def_level_) + , def_level_reader(std::move(def_level_reader_)) + , plain_data_buffer(std::move(data_buffer_)) + {} + + void readBatch(MutableColumnPtr & col_ptr, LazyNullMap & null_map, UInt32 num_values) override; + +private: + Int32 max_def_level; + std::unique_ptr def_level_reader; + ParquetDataBuffer plain_data_buffer; +}; + +template +class ParquetPlainByteArrayValuesReader : public ParquetDataValuesReader +{ +public: + + ParquetPlainByteArrayValuesReader( + Int32 max_def_level_, + std::unique_ptr def_level_reader_, + ParquetDataBuffer data_buffer_) + : max_def_level(max_def_level_) + , def_level_reader(std::move(def_level_reader_)) + , plain_data_buffer(std::move(data_buffer_)) + {} + + void readBatch(MutableColumnPtr & col_ptr, LazyNullMap & null_map, UInt32 num_values) override; + +private: + Int32 max_def_level; + std::unique_ptr def_level_reader; + ParquetDataBuffer plain_data_buffer; +}; + template class ParquetBitPlainReader : public ParquetDataValuesReader { diff --git a/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.cpp b/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.cpp index c3c7db510ed..d7f0c65d377 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.cpp +++ b/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.cpp @@ -173,13 +173,7 @@ ColumnPtr readDictPage( } -template -std::unique_ptr createPlainReader( - const parquet::ColumnDescriptor & col_des, - RleValuesReaderPtr def_level_reader, - ParquetDataBuffer buffer); - -template +template std::unique_ptr createPlainReader( const parquet::ColumnDescriptor & col_des, RleValuesReaderPtr def_level_reader, @@ -192,25 +186,62 @@ std::unique_ptr createPlainReader( std::move(buffer)); } -template + +template std::unique_ptr createPlainReader( const parquet::ColumnDescriptor & col_des, RleValuesReaderPtr def_level_reader, ParquetDataBuffer buffer) { - if (std::is_same_v> && col_des.physical_type() == parquet::Type::INT96) - return std::make_unique>( + if constexpr (std::is_same_v> && std::is_same_v) + return std::make_unique>( col_des.max_definition_level(), std::move(def_level_reader), std::move(buffer)); - return std::make_unique>( + + if constexpr (std::is_same_v) + { + return std::make_unique>( + col_des.max_definition_level(), std::move(def_level_reader), std::move(buffer)); + } + + return std::make_unique>( col_des.max_definition_level(), std::move(def_level_reader), std::move(buffer)); } +template +std::unique_ptr createReader( + const parquet::ColumnDescriptor & col_descriptor, + RleValuesReaderPtr def_level_reader, + const uint8_t * buffer, + std::size_t buffer_max_size, + const DataTypePtr & base_data_type) +{ + if constexpr (std::is_same_v) + { + auto bit_reader = std::make_unique(buffer, buffer_max_size); + return std::make_unique>( + col_descriptor.max_definition_level(), std::move(def_level_reader), std::move(bit_reader)); + } + else + { + ParquetDataBuffer parquet_buffer = [&]() + { + if constexpr (!std::is_same_v, TColumn>) + return ParquetDataBuffer(buffer, buffer_max_size); + + auto scale = assert_cast(*base_data_type).getScale(); + return ParquetDataBuffer(buffer, buffer_max_size, scale); + }(); + + return createPlainReader(col_descriptor, std::move(def_level_reader), parquet_buffer); + } +} + } // anonymous namespace -template -ParquetLeafColReader::ParquetLeafColReader( +template +ParquetLeafColReader::ParquetLeafColReader( const parquet::ColumnDescriptor & col_descriptor_, DataTypePtr base_type_, std::unique_ptr meta_, @@ -223,8 +254,8 @@ ParquetLeafColReader::ParquetLeafColReader( { } -template -ColumnWithTypeAndName ParquetLeafColReader::readBatch(UInt64 rows_num, const String & name) +template +ColumnWithTypeAndName ParquetLeafColReader::readBatch(UInt64 rows_num, const String & name) { reading_rows_num = rows_num; auto readPageIfEmpty = [&]() @@ -251,41 +282,42 @@ ColumnWithTypeAndName ParquetLeafColReader::readBatch(UInt64 rows_num, return releaseColumn(name); } -template <> -void ParquetLeafColReader::resetColumn(UInt64 rows_num) +template +void ParquetLeafColReader::resetColumn(UInt64 rows_num) { - if (reading_low_cardinality) + if constexpr (std::is_same_v) { - assert(dictionary); - visitColStrIndexType(dictionary->size(), [&](TColVec *) + if (reading_low_cardinality) { - column = TColVec::create(); - }); + assert(dictionary); + visitColStrIndexType(dictionary->size(), [&](TColVec *) + { + column = TColVec::create(); + }); - // only first position is used - null_map = std::make_unique(1); - column->reserve(rows_num); + // only first position is used + null_map = std::make_unique(1); + column->reserve(rows_num); + } + else + { + null_map = std::make_unique(rows_num); + column = ColumnString::create(); + reserveColumnStrRows(column, rows_num); + } } else { + assert(!reading_low_cardinality); + + column = base_data_type->createColumn(); + column->reserve(rows_num); null_map = std::make_unique(rows_num); - column = ColumnString::create(); - reserveColumnStrRows(column, rows_num); } } -template -void ParquetLeafColReader::resetColumn(UInt64 rows_num) -{ - assert(!reading_low_cardinality); - - column = base_data_type->createColumn(); - column->reserve(rows_num); - null_map = std::make_unique(rows_num); -} - -template -void ParquetLeafColReader::degradeDictionary() +template +void ParquetLeafColReader::degradeDictionary() { // if last batch read all dictionary indices, then degrade is not needed this time if (!column) @@ -331,8 +363,8 @@ void ParquetLeafColReader::degradeDictionary() LOG_DEBUG(log, "degraded dictionary to normal column"); } -template -ColumnWithTypeAndName ParquetLeafColReader::releaseColumn(const String & name) +template +ColumnWithTypeAndName ParquetLeafColReader::releaseColumn(const String & name) { DataTypePtr data_type = base_data_type; if (reading_low_cardinality) @@ -365,8 +397,8 @@ ColumnWithTypeAndName ParquetLeafColReader::releaseColumn(const String return res; } -template -void ParquetLeafColReader::readPage() +template +void ParquetLeafColReader::readPage() { // refer to: ColumnReaderImplBase::ReadNewPage in column_reader.cc // this is where decompression happens @@ -408,8 +440,8 @@ void ParquetLeafColReader::readPage() } } -template -void ParquetLeafColReader::initDataReader( +template +void ParquetLeafColReader::initDataReader( parquet::Encoding::type enconding_type, const uint8_t * buffer, std::size_t max_size, @@ -425,29 +457,8 @@ void ParquetLeafColReader::initDataReader( degradeDictionary(); } - if (col_descriptor.physical_type() == parquet::Type::BOOLEAN) - { - if constexpr (std::is_same_v) - { - auto bit_reader = std::make_unique(buffer, max_size); - data_values_reader = std::make_unique>(col_descriptor.max_definition_level(), - std::move(def_level_reader), - std::move(bit_reader)); - } - } - else - { - ParquetDataBuffer parquet_buffer = [&]() - { - if constexpr (!std::is_same_v, TColumn>) - return ParquetDataBuffer(buffer, max_size); - - auto scale = assert_cast(*base_data_type).getScale(); - return ParquetDataBuffer(buffer, max_size, scale); - }(); - data_values_reader = createPlainReader( - col_descriptor, std::move(def_level_reader), std::move(parquet_buffer)); - } + data_values_reader = createReader( + col_descriptor, std::move(def_level_reader), buffer, max_size, base_data_type); break; } case parquet::Encoding::RLE_DICTIONARY: @@ -476,8 +487,8 @@ void ParquetLeafColReader::initDataReader( } } -template -void ParquetLeafColReader::readPageV1(const parquet::DataPageV1 & page) +template +void ParquetLeafColReader::readPageV1(const parquet::DataPageV1 & page) { cur_page_values = page.num_values(); @@ -562,8 +573,8 @@ void ParquetLeafColReader::readPageV1(const parquet::DataPageV1 & page) * The data buffer is "offset-ed" by rl bytes length and then dl decoder is built using RLE decoder. Since dl bytes length was present in the header, * there is no need to read it and apply an offset like in page v1. * */ -template -void ParquetLeafColReader::readPageV2(const parquet::DataPageV2 & page) +template +void ParquetLeafColReader::readPageV2(const parquet::DataPageV2 & page) { cur_page_values = page.num_values(); @@ -609,28 +620,32 @@ void ParquetLeafColReader::readPageV2(const parquet::DataPageV2 & page) initDataReader(page.encoding(), buffer, page.size() - total_levels_length, std::move(def_level_reader)); } -template -std::unique_ptr ParquetLeafColReader::createDictReader( +template +std::unique_ptr ParquetLeafColReader::createDictReader( std::unique_ptr def_level_reader, std::unique_ptr rle_data_reader) { - if (reading_low_cardinality && std::same_as) - { - std::unique_ptr res; - visitColStrIndexType(dictionary->size(), [&](TCol *) - { - res = std::make_unique>( - col_descriptor.max_definition_level(), - std::move(def_level_reader), - std::move(rle_data_reader)); - }); - return res; - } - - if (col_descriptor.physical_type() == parquet::Type::type::BOOLEAN) + if constexpr (std::is_same_v || std::is_same_v + || std::is_same_v || std::is_same_v) { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Dictionary encoding for booleans is not supported"); } + if (reading_low_cardinality) + { + if constexpr (std::same_as) + { + std::unique_ptr res; + visitColStrIndexType(dictionary->size(), [&](TCol *) + { + res = std::make_unique>( + col_descriptor.max_definition_level(), + std::move(def_level_reader), + std::move(rle_data_reader)); + }); + return res; + } + } + return std::make_unique>( col_descriptor.max_definition_level(), std::move(def_level_reader), @@ -639,18 +654,23 @@ std::unique_ptr ParquetLeafColReader::createDi } -template class ParquetLeafColReader; -template class ParquetLeafColReader; -template class ParquetLeafColReader; -template class ParquetLeafColReader; -template class ParquetLeafColReader; -template class ParquetLeafColReader; -template class ParquetLeafColReader; -template class ParquetLeafColReader; -template class ParquetLeafColReader>; -template class ParquetLeafColReader>; -template class ParquetLeafColReader>; -template class ParquetLeafColReader>; -template class ParquetLeafColReader>; +template class ParquetLeafColReader; +template class ParquetLeafColReader; +template class ParquetLeafColReader; +template class ParquetLeafColReader; +template class ParquetLeafColReader; +template class ParquetLeafColReader; +template class ParquetLeafColReader; +template class ParquetLeafColReader; +template class ParquetLeafColReader; +template class ParquetLeafColReader; +template class ParquetLeafColReader; +template class ParquetLeafColReader; +template class ParquetLeafColReader, int32_t>; +template class ParquetLeafColReader, int64_t>; +template class ParquetLeafColReader, ParquetByteArrayTypeStub>; +template class ParquetLeafColReader, ParquetByteArrayTypeStub>; +template class ParquetLeafColReader, ParquetInt96TypeStub>; +template class ParquetLeafColReader, int64_t>; } diff --git a/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.h b/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.h index e1eb7702def..395114c282d 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.h +++ b/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.h @@ -17,7 +17,10 @@ class ColumnDescriptor; namespace DB { -template +struct ParquetByteArrayTypeStub {}; +struct ParquetInt96TypeStub {}; + +template class ParquetLeafColReader : public ParquetColumnReader { public: diff --git a/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp b/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp index 971bb9e1be5..deaa7459dc5 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp +++ b/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp @@ -93,19 +93,20 @@ private: std::unique_ptr fromInt32INT(const parquet::IntLogicalType & int_type); std::unique_ptr fromInt64INT(const parquet::IntLogicalType & int_type); - template + template auto makeLeafReader() { - return std::make_unique>( - col_descriptor, std::make_shared(), std::move(meta), std::move(page_reader)); + return std::make_unique>( + col_descriptor, std::make_shared(), std::move(meta), std::move(page_reader)); } - template + template auto makeDecimalLeafReader() { auto data_type = std::make_shared>( col_descriptor.type_precision(), col_descriptor.type_scale()); - return std::make_unique>>( + + return std::make_unique, ParquetType>>( col_descriptor, std::move(data_type), std::move(meta), std::move(page_reader)); } @@ -157,11 +158,11 @@ std::unique_ptr ColReaderFactory::fromInt32() case parquet::LogicalType::Type::INT: return fromInt32INT(dynamic_cast(*col_descriptor.logical_type())); case parquet::LogicalType::Type::NONE: - return makeLeafReader(); + return makeLeafReader(); case parquet::LogicalType::Type::DATE: - return makeLeafReader(); + return makeLeafReader(); case parquet::LogicalType::Type::DECIMAL: - return makeDecimalLeafReader(); + return makeDecimalLeafReader(); default: return throwUnsupported(); } @@ -174,16 +175,16 @@ std::unique_ptr ColReaderFactory::fromInt64() case parquet::LogicalType::Type::INT: return fromInt64INT(dynamic_cast(*col_descriptor.logical_type())); case parquet::LogicalType::Type::NONE: - return makeLeafReader(); + return makeLeafReader(); case parquet::LogicalType::Type::TIMESTAMP: { const auto & tm_type = dynamic_cast(*col_descriptor.logical_type()); auto read_type = std::make_shared(getScaleFromLogicalTimestamp(tm_type.time_unit())); - return std::make_unique>>( + return std::make_unique, int64_t>>( col_descriptor, std::move(read_type), std::move(meta), std::move(page_reader)); } case parquet::LogicalType::Type::DECIMAL: - return makeDecimalLeafReader(); + return makeDecimalLeafReader(); default: return throwUnsupported(); } @@ -195,7 +196,7 @@ std::unique_ptr ColReaderFactory::fromByteArray() { case parquet::LogicalType::Type::STRING: case parquet::LogicalType::Type::NONE: - return makeLeafReader(); + return makeLeafReader(); default: return throwUnsupported(); } @@ -210,9 +211,9 @@ std::unique_ptr ColReaderFactory::fromFLBA() if (col_descriptor.type_length() > 0) { if (col_descriptor.type_length() <= static_cast(sizeof(Decimal128))) - return makeDecimalLeafReader(); + return makeDecimalLeafReader(); if (col_descriptor.type_length() <= static_cast(sizeof(Decimal256))) - return makeDecimalLeafReader(); + return makeDecimalLeafReader(); } return throwUnsupported(PreformattedMessage::create( @@ -227,11 +228,23 @@ std::unique_ptr ColReaderFactory::fromInt32INT(const parque { switch (int_type.bit_width()) { + case 8: + { + if (int_type.is_signed()) + return makeLeafReader(); + return makeLeafReader(); + } + case 16: + { + if (int_type.is_signed()) + return makeLeafReader(); + return makeLeafReader(); + } case 32: { if (int_type.is_signed()) - return makeLeafReader(); - return makeLeafReader(); + return makeLeafReader(); + return makeLeafReader(); } default: return throwUnsupported(PreformattedMessage::create(", bit width: {}", int_type.bit_width())); @@ -245,8 +258,8 @@ std::unique_ptr ColReaderFactory::fromInt64INT(const parque case 64: { if (int_type.is_signed()) - return makeLeafReader(); - return makeLeafReader(); + return makeLeafReader(); + return makeLeafReader(); } default: return throwUnsupported(PreformattedMessage::create(", bit width: {}", int_type.bit_width())); @@ -263,7 +276,7 @@ std::unique_ptr ColReaderFactory::makeReader() switch (col_descriptor.physical_type()) { case parquet::Type::BOOLEAN: - return makeLeafReader(); + return makeLeafReader(); case parquet::Type::INT32: return fromInt32(); case parquet::Type::INT64: @@ -276,13 +289,13 @@ std::unique_ptr ColReaderFactory::makeReader() auto scale = getScaleFromArrowTimeUnit(arrow_properties.coerce_int96_timestamp_unit()); read_type = std::make_shared(scale); } - return std::make_unique>>( + return std::make_unique, ParquetInt96TypeStub>>( col_descriptor, read_type, std::move(meta), std::move(page_reader)); } case parquet::Type::FLOAT: - return makeLeafReader(); + return makeLeafReader(); case parquet::Type::DOUBLE: - return makeLeafReader(); + return makeLeafReader(); case parquet::Type::BYTE_ARRAY: return fromByteArray(); case parquet::Type::FIXED_LEN_BYTE_ARRAY: diff --git a/tests/queries/0_stateless/03262_test_parquet_native_reader_int_logical_type.reference b/tests/queries/0_stateless/03262_test_parquet_native_reader_int_logical_type.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/03262_test_parquet_native_reader_int_logical_type.sh b/tests/queries/0_stateless/03262_test_parquet_native_reader_int_logical_type.sh new file mode 100644 index 00000000000..2c234d6040d --- /dev/null +++ b/tests/queries/0_stateless/03262_test_parquet_native_reader_int_logical_type.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash +# Tags: no-ubsan, no-fasttest + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + + +USER_FILES_PATH=$($CLICKHOUSE_CLIENT_BINARY --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') + +WORKING_DIR="${USER_FILES_PATH}/${CLICKHOUSE_TEST_UNIQUE_NAME}" + +mkdir -p "${WORKING_DIR}" + +DATA_FILE="${CUR_DIR}/data_parquet/multi_column_bf.gz.parquet" + +DATA_FILE_USER_PATH="${WORKING_DIR}/multi_column_bf.gz.parquet" + +cp ${DATA_FILE} ${DATA_FILE_USER_PATH} + +${CLICKHOUSE_CLIENT} --query="select int8_logical, uint16_logical, uint64_logical from file('${DATA_FILE_USER_PATH}', Parquet) order by uint64_logical limit 10; From cbda101228e9c856d2c1550c7691be4d34d2e4ea Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 20 Nov 2024 12:42:45 -0300 Subject: [PATCH 2/3] missing double quotes --- .../03262_test_parquet_native_reader_int_logical_type.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/03262_test_parquet_native_reader_int_logical_type.sh b/tests/queries/0_stateless/03262_test_parquet_native_reader_int_logical_type.sh index 2c234d6040d..dc47b61d7aa 100644 --- a/tests/queries/0_stateless/03262_test_parquet_native_reader_int_logical_type.sh +++ b/tests/queries/0_stateless/03262_test_parquet_native_reader_int_logical_type.sh @@ -18,4 +18,4 @@ DATA_FILE_USER_PATH="${WORKING_DIR}/multi_column_bf.gz.parquet" cp ${DATA_FILE} ${DATA_FILE_USER_PATH} -${CLICKHOUSE_CLIENT} --query="select int8_logical, uint16_logical, uint64_logical from file('${DATA_FILE_USER_PATH}', Parquet) order by uint64_logical limit 10; +${CLICKHOUSE_CLIENT} --query="select int8_logical, uint16_logical, uint64_logical from file('${DATA_FILE_USER_PATH}', Parquet) order by uint64_logical limit 10"; From bc866e64ea08649727ca477464fb109daca6bc62 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 20 Nov 2024 15:26:02 -0300 Subject: [PATCH 3/3] minor adjustment and tests --- .../Impl/Parquet/ParquetDataValuesReader.cpp | 4 +- ...t_native_reader_int_logical_type.reference | 40 +++++++++++++++++++ ..._parquet_native_reader_int_logical_type.sh | 3 +- 3 files changed, 44 insertions(+), 3 deletions(-) mode change 100644 => 100755 tests/queries/0_stateless/03262_test_parquet_native_reader_int_logical_type.sh diff --git a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp index 758627f833d..6ccbde8eabb 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp +++ b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp @@ -365,11 +365,11 @@ void ParquetPlainValuesReader::readBatch( null_map, /* individual_visitor */ [&](size_t nest_cursor) { - plain_data_buffer.readValue(column_data[nest_cursor]); + plain_data_buffer.readValuesOfDifferentSize(column_data + nest_cursor, 1); }, /* repeated_visitor */ [&](size_t nest_cursor, UInt32 count) { - plain_data_buffer.readBytes(column_data + nest_cursor, count * sizeof(TValue)); + plain_data_buffer.readValuesOfDifferentSize(column_data + nest_cursor, count); } ); } diff --git a/tests/queries/0_stateless/03262_test_parquet_native_reader_int_logical_type.reference b/tests/queries/0_stateless/03262_test_parquet_native_reader_int_logical_type.reference index e69de29bb2d..73ed2f206f8 100644 --- a/tests/queries/0_stateless/03262_test_parquet_native_reader_int_logical_type.reference +++ b/tests/queries/0_stateless/03262_test_parquet_native_reader_int_logical_type.reference @@ -0,0 +1,40 @@ +-94 53304 17815465730223871 +57 15888 33652524900575246 +-4 14877 53832092832965652 +33 3387 86326601511136103 +104 3383 115438187156564782 +-11 37403 145056169255259589 +-72 46473 159324626361233509 +103 35510 173644182696185097 +-26 60902 185175917734318892 +70 48767 193167023342307884 +2 21648 247953090704786001 +20 2986 268127160817221407 +76 20277 290178827409195337 +61 28692 305149163504092270 +-74 65427 326871531363668398 +-15 20256 351812901947846888 +-39 65472 357371822264135234 +79 38671 371605113770958364 +-29 41706 394460710549666968 +92 25026 412913269933311543 +-94 53304 17815465730223871 +57 15888 33652524900575246 +-4 14877 53832092832965652 +33 3387 86326601511136103 +104 3383 115438187156564782 +-11 37403 145056169255259589 +-72 46473 159324626361233509 +103 35510 173644182696185097 +-26 60902 185175917734318892 +70 48767 193167023342307884 +2 21648 247953090704786001 +20 2986 268127160817221407 +76 20277 290178827409195337 +61 28692 305149163504092270 +-74 65427 326871531363668398 +-15 20256 351812901947846888 +-39 65472 357371822264135234 +79 38671 371605113770958364 +-29 41706 394460710549666968 +92 25026 412913269933311543 diff --git a/tests/queries/0_stateless/03262_test_parquet_native_reader_int_logical_type.sh b/tests/queries/0_stateless/03262_test_parquet_native_reader_int_logical_type.sh old mode 100644 new mode 100755 index dc47b61d7aa..7d05171516f --- a/tests/queries/0_stateless/03262_test_parquet_native_reader_int_logical_type.sh +++ b/tests/queries/0_stateless/03262_test_parquet_native_reader_int_logical_type.sh @@ -18,4 +18,5 @@ DATA_FILE_USER_PATH="${WORKING_DIR}/multi_column_bf.gz.parquet" cp ${DATA_FILE} ${DATA_FILE_USER_PATH} -${CLICKHOUSE_CLIENT} --query="select int8_logical, uint16_logical, uint64_logical from file('${DATA_FILE_USER_PATH}', Parquet) order by uint64_logical limit 10"; +${CLICKHOUSE_CLIENT} --query="select int8_logical, uint16_logical, uint64_logical from file('${DATA_FILE_USER_PATH}', Parquet) order by uint64_logical limit 20 SETTINGS input_format_parquet_use_native_reader=false;"; +${CLICKHOUSE_CLIENT} --query="select int8_logical, uint16_logical, uint64_logical from file('${DATA_FILE_USER_PATH}', Parquet) order by uint64_logical limit 20 SETTINGS input_format_parquet_use_native_reader=true;";