Merge pull request #54809 from ClickHouse/pqmeta

Prevent ParquetMetadata reading 40 MB from each file unnecessarily
This commit is contained in:
Kruglov Pavel 2023-09-20 12:53:22 +02:00 committed by GitHub
commit 49ee14f701
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -130,7 +130,7 @@ static std::shared_ptr<parquet::FileMetaData> getFileMetadata(
const FormatSettings & format_settings,
std::atomic<int> & is_stopped)
{
auto arrow_file = asArrowFile(in, format_settings, is_stopped, "Parquet", PARQUET_MAGIC_BYTES);
auto arrow_file = asArrowFile(in, format_settings, is_stopped, "Parquet", PARQUET_MAGIC_BYTES, /* avoid_buffering */ true);
return parquet::ReadMetaData(arrow_file);
}
@@ -495,12 +495,15 @@ NamesAndTypesList ParquetMetadataSchemaReader::readSchema()
void registerInputFormatParquetMetadata(FormatFactory & factory)
{
factory.registerInputFormat(
factory.registerRandomAccessInputFormat(
"ParquetMetadata",
[](ReadBuffer &buf,
const Block &sample,
const RowInputFormatParams &,
const FormatSettings & settings)
[](ReadBuffer & buf,
const Block & sample,
const FormatSettings & settings,
const ReadSettings &,
bool /* is_remote_fs */,
size_t /* max_download_threads */,
size_t /* max_parsing_threads */)
{
return std::make_shared<ParquetMetadataInputFormat>(buf, sample, settings);
});