This commit is contained in:
Arthur Passos 2024-10-24 16:05:19 -03:00
parent e1025300d0
commit 03e0e9a14b
4 changed files with 88 additions and 1 deletions

View File

@ -296,6 +296,40 @@ void ParquetPlainValuesReader<ColumnString>::readBatch(
);
}
/// Appends `num_values` rows to a UInt8 column, taking each stored value as a single
/// bit pulled from `bit_reader`.
///
/// The column's data container is grown by `num_values` up front. The definition-level
/// reader then drives the two callbacks below, handing them the destination index
/// (and, for the repeated form, a run length) of the slots to fill.
///
/// @param col_ptr    Destination column; it is cast to ColumnUInt8.
/// @param null_map   Forwarded unchanged to `def_level_reader->visitNullableValues`.
/// @param num_values Number of rows the column is extended by.
template <>
void ParquetBitPlainReader<ColumnUInt8>::readBatch(
    MutableColumnPtr & col_ptr, LazyNullMap & null_map, UInt32 num_values)
{
    auto & column = *assert_cast<ColumnUInt8 *>(col_ptr.get());
    const auto cursor = column.size();
    auto & container = column.getData();
    container.resize(cursor + num_values);

    /// Pulls the next bit from the stream. The result starts at 0 so that a failed
    /// read (GetValue reports failure through its return value, e.g. when the buffer
    /// is exhausted) yields a well-defined value instead of an indeterminate one.
    auto read_bit = [this]() -> uint8_t
    {
        uint8_t bit = 0;
        bit_reader->GetValue(1, &bit);
        return bit;
    };

    def_level_reader->visitNullableValues(
        cursor,
        num_values,
        max_def_level,
        null_map,
        /* individual_visitor */ [&](size_t nest_cursor)
        {
            container[nest_cursor] = read_bit();
        },
        /* repeated_visitor */ [&](size_t nest_cursor, UInt32 count)
        {
            /// One bit per destination slot, written to consecutive positions.
            for (UInt32 i = 0; i < count; ++i)
                container[nest_cursor + i] = read_bit();
        }
    );
}
template <>
void ParquetPlainValuesReader<ColumnDecimal<DateTime64>, ParquetReaderTypes::TimestampInt96>::readBatch(
@ -515,6 +549,13 @@ void ParquetRleDictReader<ColumnString>::readBatch(
);
}
/// Specialization for UInt8 columns that performs no decoding: all three arguments
/// are ignored and the body only trips an assertion.
/// NOTE(review): `assert` expands to nothing when NDEBUG is defined, in which case this
/// returns without touching the column — confirm this path is never reached in such builds.
template <>
void ParquetRleDictReader<ColumnUInt8>::readBatch(
    MutableColumnPtr & /* col_ptr */, LazyNullMap & /* null_map */, UInt32 /* num_values */)
{
    assert(false);
}
template <typename TColumnVector>
void ParquetRleDictReader<TColumnVector>::readBatch(
MutableColumnPtr & col_ptr, LazyNullMap & null_map, UInt32 num_values)
@ -561,6 +602,7 @@ template class ParquetPlainValuesReader<ColumnDecimal<Decimal32>>;
template class ParquetPlainValuesReader<ColumnDecimal<Decimal64>>;
template class ParquetPlainValuesReader<ColumnDecimal<DateTime64>>;
template class ParquetPlainValuesReader<ColumnString>;
template class ParquetPlainValuesReader<ColumnUInt8>;
template class ParquetFixedLenPlainReader<ColumnDecimal<Decimal128>>;
template class ParquetFixedLenPlainReader<ColumnDecimal<Decimal256>>;
@ -569,6 +611,7 @@ template class ParquetRleLCReader<ColumnUInt8>;
template class ParquetRleLCReader<ColumnUInt16>;
template class ParquetRleLCReader<ColumnUInt32>;
template class ParquetRleDictReader<ColumnUInt8>;
template class ParquetRleDictReader<ColumnInt32>;
template class ParquetRleDictReader<ColumnUInt32>;
template class ParquetRleDictReader<ColumnInt64>;

View File

@ -172,6 +172,27 @@ private:
ParquetDataBuffer plain_data_buffer;
};
template <typename TColumn>
class ParquetBitPlainReader : public ParquetDataValuesReader
{
public:
ParquetBitPlainReader(
Int32 max_def_level_,
std::unique_ptr<RleValuesReader> def_level_reader_,
std::unique_ptr<arrow::bit_util::BitReader> bit_reader_)
: max_def_level(max_def_level_)
, def_level_reader(std::move(def_level_reader_))
, bit_reader(std::move(bit_reader_))
{}
void readBatch(MutableColumnPtr & col_ptr, LazyNullMap & null_map, UInt32 num_values) override;
private:
Int32 max_def_level;
std::unique_ptr<RleValuesReader> def_level_reader;
std::unique_ptr<arrow::bit_util::BitReader> bit_reader;
};
/**
* The data and definition level encodings are the same as in ParquetPlainValuesReader,
* but the element size is constant and larger than that of a primitive data type.

View File

@ -463,6 +463,28 @@ void ParquetLeafColReader<TColumn>::initDataReader(
}
}
/// Sets up `data_values_reader` for a UInt8 leaf column.
///
/// Only parquet::Encoding::PLAIN is accepted: a bit reader is created over
/// `buffer`/`max_size` and wrapped, together with the definition-level reader, in a
/// ParquetBitPlainReader. Any other encoding raises PARQUET_EXCEPTION.
///
/// @param enconding_type   Encoding of the data.
/// @param buffer           Start of the encoded values.
/// @param max_size         Size passed to the bit reader for `buffer`.
/// @param def_level_reader Definition-level reader; moved into the created values reader.
template <>
void ParquetLeafColReader<ColumnUInt8>::initDataReader(
    parquet::Encoding::type enconding_type,
    const uint8_t * buffer,
    std::size_t max_size,
    std::unique_ptr<RleValuesReader> && def_level_reader)
{
    /// Reject everything except PLAIN up front so the happy path stays flat.
    if (enconding_type != parquet::Encoding::PLAIN)
        throw Exception(ErrorCodes::PARQUET_EXCEPTION, "Unknown encoding type: {}", enconding_type);

    auto values_bit_reader = std::make_unique<arrow::bit_util::BitReader>(buffer, max_size);

    data_values_reader = std::make_unique<ParquetBitPlainReader<ColumnUInt8>>(
        col_descriptor.max_definition_level(),
        std::move(def_level_reader),
        std::move(values_bit_reader));
}
template <typename TColumn>
void ParquetLeafColReader<TColumn>::readPageV1(const parquet::DataPageV1 & page)
{
@ -620,6 +642,7 @@ std::unique_ptr<ParquetDataValuesReader> ParquetLeafColReader<TColumn>::createDi
}
template class ParquetLeafColReader<ColumnUInt8>;
template class ParquetLeafColReader<ColumnInt32>;
template class ParquetLeafColReader<ColumnUInt32>;
template class ParquetLeafColReader<ColumnInt64>;

View File

@ -263,7 +263,7 @@ std::unique_ptr<ParquetColumnReader> ColReaderFactory::makeReader()
switch (col_descriptor.physical_type())
{
case parquet::Type::BOOLEAN:
break;
return makeLeafReader<DataTypeUInt8>();
case parquet::Type::INT32:
return fromInt32();
case parquet::Type::INT64: