From 03e0e9a14b6099bf41f402b57065aedfc6515f0b Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Thu, 24 Oct 2024 16:05:19 -0300 Subject: [PATCH] draft --- .../Impl/Parquet/ParquetDataValuesReader.cpp | 43 +++++++++++++++++++ .../Impl/Parquet/ParquetDataValuesReader.h | 21 +++++++++ .../Impl/Parquet/ParquetLeafColReader.cpp | 23 ++++++++++ .../Impl/Parquet/ParquetRecordReader.cpp | 2 +- 4 files changed, 88 insertions(+), 1 deletion(-) diff --git a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp index b8e4db8700c..977f2ad298b 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp +++ b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp @@ -296,6 +296,40 @@ void ParquetPlainValuesReader::readBatch( ); } +template <> +void ParquetBitPlainReader::readBatch( + MutableColumnPtr & col_ptr, LazyNullMap & null_map, UInt32 num_values) +{ + auto & column = *assert_cast(col_ptr.get()); + auto cursor = column.size(); + + auto & container = column.getData(); + + container.resize(cursor + num_values); + + def_level_reader->visitNullableValues( + cursor, + num_values, + max_def_level, + null_map, + /* individual_visitor */ [&](size_t nest_cursor) + { + uint8_t byte; + bit_reader->GetValue(1, &byte); + container[nest_cursor] = byte; + }, + /* repeated_visitor */ [&](size_t nest_cursor, UInt32 count) + { + for (UInt32 i = 0; i < count; i++) + { + uint8_t byte; + bit_reader->GetValue(1, &byte); + container[nest_cursor++] = byte; + } + } + ); +} + template <> void ParquetPlainValuesReader, ParquetReaderTypes::TimestampInt96>::readBatch( @@ -515,6 +549,13 @@ void ParquetRleDictReader::readBatch( ); } +template <> +void ParquetRleDictReader::readBatch( + MutableColumnPtr & , LazyNullMap &, UInt32) +{ + assert(false); +} + template void ParquetRleDictReader::readBatch( MutableColumnPtr & col_ptr, LazyNullMap & null_map, UInt32 num_values) @@ -561,6 +602,7 @@ template class ParquetPlainValuesReader>; template class ParquetPlainValuesReader>; template class ParquetPlainValuesReader>; template class ParquetPlainValuesReader; +template class ParquetPlainValuesReader; template class ParquetFixedLenPlainReader>; template class ParquetFixedLenPlainReader>; @@ -569,6 +611,7 @@ template class ParquetRleLCReader; template class ParquetRleLCReader; template class ParquetRleLCReader; +template class ParquetRleDictReader; template class ParquetRleDictReader; template class ParquetRleDictReader; template class ParquetRleDictReader; diff --git a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.h b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.h index fbccb612b3c..db55f7e2d6a 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.h +++ b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.h @@ -172,6 +172,27 @@ private: ParquetDataBuffer plain_data_buffer; }; +template +class ParquetBitPlainReader : public ParquetDataValuesReader +{ +public: + ParquetBitPlainReader( + Int32 max_def_level_, + std::unique_ptr def_level_reader_, + std::unique_ptr bit_reader_) + : max_def_level(max_def_level_) + , def_level_reader(std::move(def_level_reader_)) + , bit_reader(std::move(bit_reader_)) + {} + + void readBatch(MutableColumnPtr & col_ptr, LazyNullMap & null_map, UInt32 num_values) override; + +private: + Int32 max_def_level; + std::unique_ptr def_level_reader; + std::unique_ptr bit_reader; +}; + /** * The data and definition level encoding are same as ParquetPlainValuesReader. * But the element size is const and bigger than primitive data type. diff --git a/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.cpp b/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.cpp index 4b5880eba37..f32d7e61062 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.cpp +++ b/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.cpp @@ -463,6 +463,28 @@ void ParquetLeafColReader::initDataReader( } } +template <> +void ParquetLeafColReader::initDataReader( + parquet::Encoding::type enconding_type, + const uint8_t * buffer, + std::size_t max_size, + std::unique_ptr && def_level_reader) +{ + switch (enconding_type) + { + case parquet::Encoding::PLAIN: + { + auto bit_reader = std::make_unique(buffer, max_size); + data_values_reader = std::make_unique>(col_descriptor.max_definition_level(), + std::move(def_level_reader), + std::move(bit_reader)); + break; + } + default: + throw Exception(ErrorCodes::PARQUET_EXCEPTION, "Unknown encoding type: {}", enconding_type); + } +} + template void ParquetLeafColReader::readPageV1(const parquet::DataPageV1 & page) { @@ -620,6 +642,7 @@ std::unique_ptr ParquetLeafColReader::createDi } +template class ParquetLeafColReader; template class ParquetLeafColReader; template class ParquetLeafColReader; template class ParquetLeafColReader; diff --git a/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp b/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp index acf11a30162..971bb9e1be5 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp +++ b/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp @@ -263,7 +263,7 @@ std::unique_ptr ColReaderFactory::makeReader() switch (col_descriptor.physical_type()) { case parquet::Type::BOOLEAN: - break; + return makeLeafReader(); case parquet::Type::INT32: return fromInt32(); case parquet::Type::INT64: