From b277a5c943a1cc5e64d78c8dbcc737aad9cf1539 Mon Sep 17 00:00:00 2001 From: avogar Date: Tue, 18 Apr 2023 11:07:08 +0000 Subject: [PATCH] Add ParquetMetadata input format to read Parquet file metadata --- docs/en/interfaces/formats.md | 272 +++++++--- src/Formats/registerFormats.cpp | 5 + .../Impl/ParquetMetadataInputFormat.cpp | 499 ++++++++++++++++++ .../Formats/Impl/ParquetMetadataInputFormat.h | 90 ++++ .../02718_parquet_metadata_format.reference | 154 ++++++ .../02718_parquet_metadata_format.sh | 7 + .../data_parquet/02718_data.parquet | Bin 0 -> 28165 bytes 7 files changed, 952 insertions(+), 75 deletions(-) create mode 100644 src/Processors/Formats/Impl/ParquetMetadataInputFormat.cpp create mode 100644 src/Processors/Formats/Impl/ParquetMetadataInputFormat.h create mode 100644 tests/queries/0_stateless/02718_parquet_metadata_format.reference create mode 100755 tests/queries/0_stateless/02718_parquet_metadata_format.sh create mode 100644 tests/queries/0_stateless/data_parquet/02718_data.parquet diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index b4823d5ebaf..b17c3c14f73 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -10,80 +10,82 @@ results of a `SELECT`, and to perform `INSERT`s into a file-backed table. 
The supported formats are: -| Format | Input | Output | -|-------------------------------------------------------------------------------------------|------|--------| -| [TabSeparated](#tabseparated) | ✔ | ✔ | -| [TabSeparatedRaw](#tabseparatedraw) | ✔ | ✔ | -| [TabSeparatedWithNames](#tabseparatedwithnames) | ✔ | ✔ | -| [TabSeparatedWithNamesAndTypes](#tabseparatedwithnamesandtypes) | ✔ | ✔ | -| [TabSeparatedRawWithNames](#tabseparatedrawwithnames) | ✔ | ✔ | -| [TabSeparatedRawWithNamesAndTypes](#tabseparatedrawwithnamesandtypes) | ✔ | ✔ | -| [Template](#format-template) | ✔ | ✔ | -| [TemplateIgnoreSpaces](#templateignorespaces) | ✔ | ✗ | -| [CSV](#csv) | ✔ | ✔ | -| [CSVWithNames](#csvwithnames) | ✔ | ✔ | -| [CSVWithNamesAndTypes](#csvwithnamesandtypes) | ✔ | ✔ | -| [CustomSeparated](#format-customseparated) | ✔ | ✔ | -| [CustomSeparatedWithNames](#customseparatedwithnames) | ✔ | ✔ | -| [CustomSeparatedWithNamesAndTypes](#customseparatedwithnamesandtypes) | ✔ | ✔ | -| [SQLInsert](#sqlinsert) | ✗ | ✔ | -| [Values](#data-format-values) | ✔ | ✔ | -| [Vertical](#vertical) | ✗ | ✔ | -| [JSON](#json) | ✔ | ✔ | -| [JSONAsString](#jsonasstring) | ✔ | ✗ | -| [JSONStrings](#jsonstrings) | ✔ | ✔ | -| [JSONColumns](#jsoncolumns) | ✔ | ✔ | -| [JSONColumnsWithMetadata](#jsoncolumnsmonoblock)) | ✔ | ✔ | -| [JSONCompact](#jsoncompact) | ✔ | ✔ | -| [JSONCompactStrings](#jsoncompactstrings) | ✗ | ✔ | -| [JSONCompactColumns](#jsoncompactcolumns) | ✔ | ✔ | -| [JSONEachRow](#jsoneachrow) | ✔ | ✔ | -| [JSONEachRowWithProgress](#jsoneachrowwithprogress) | ✗ | ✔ | -| [JSONStringsEachRow](#jsonstringseachrow) | ✔ | ✔ | -| [JSONStringsEachRowWithProgress](#jsonstringseachrowwithprogress) | ✗ | ✔ | -| [JSONCompactEachRow](#jsoncompacteachrow) | ✔ | ✔ | -| [JSONCompactEachRowWithNames](#jsoncompacteachrowwithnames) | ✔ | ✔ | -| [JSONCompactEachRowWithNamesAndTypes](#jsoncompacteachrowwithnamesandtypes) | ✔ | ✔ | -| [JSONCompactStringsEachRow](#jsoncompactstringseachrow) | ✔ | ✔ | -| 
[JSONCompactStringsEachRowWithNames](#jsoncompactstringseachrowwithnames) | ✔ | ✔ | +| Format | Input | Output | +|------------------------------------------------------------------------------------------|------|--------| +| [TabSeparated](#tabseparated) | ✔ | ✔ | +| [TabSeparatedRaw](#tabseparatedraw) | ✔ | ✔ | +| [TabSeparatedWithNames](#tabseparatedwithnames) | ✔ | ✔ | +| [TabSeparatedWithNamesAndTypes](#tabseparatedwithnamesandtypes) | ✔ | ✔ | +| [TabSeparatedRawWithNames](#tabseparatedrawwithnames) | ✔ | ✔ | +| [TabSeparatedRawWithNamesAndTypes](#tabseparatedrawwithnamesandtypes) | ✔ | ✔ | +| [Template](#format-template) | ✔ | ✔ | +| [TemplateIgnoreSpaces](#templateignorespaces) | ✔ | ✗ | +| [CSV](#csv) | ✔ | ✔ | +| [CSVWithNames](#csvwithnames) | ✔ | ✔ | +| [CSVWithNamesAndTypes](#csvwithnamesandtypes) | ✔ | ✔ | +| [CustomSeparated](#format-customseparated) | ✔ | ✔ | +| [CustomSeparatedWithNames](#customseparatedwithnames) | ✔ | ✔ | +| [CustomSeparatedWithNamesAndTypes](#customseparatedwithnamesandtypes) | ✔ | ✔ | +| [SQLInsert](#sqlinsert) | ✗ | ✔ | +| [Values](#data-format-values) | ✔ | ✔ | +| [Vertical](#vertical) | ✗ | ✔ | +| [JSON](#json) | ✔ | ✔ | +| [JSONAsString](#jsonasstring) | ✔ | ✗ | +| [JSONStrings](#jsonstrings) | ✔ | ✔ | +| [JSONColumns](#jsoncolumns) | ✔ | ✔ | +| [JSONColumnsWithMetadata](#jsoncolumnsmonoblock)) | ✔ | ✔ | +| [JSONCompact](#jsoncompact) | ✔ | ✔ | +| [JSONCompactStrings](#jsoncompactstrings) | ✗ | ✔ | +| [JSONCompactColumns](#jsoncompactcolumns) | ✔ | ✔ | +| [JSONEachRow](#jsoneachrow) | ✔ | ✔ | +| [PrettyJSONEachRow](#prettyjsoneachrow) | ✗ | ✔ | +| [JSONEachRowWithProgress](#jsoneachrowwithprogress) | ✗ | ✔ | +| [JSONStringsEachRow](#jsonstringseachrow) | ✔ | ✔ | +| [JSONStringsEachRowWithProgress](#jsonstringseachrowwithprogress) | ✗ | ✔ | +| [JSONCompactEachRow](#jsoncompacteachrow) | ✔ | ✔ | +| [JSONCompactEachRowWithNames](#jsoncompacteachrowwithnames) | ✔ | ✔ | +| 
[JSONCompactEachRowWithNamesAndTypes](#jsoncompacteachrowwithnamesandtypes) | ✔ | ✔ | +| [JSONCompactStringsEachRow](#jsoncompactstringseachrow) | ✔ | ✔ | +| [JSONCompactStringsEachRowWithNames](#jsoncompactstringseachrowwithnames) | ✔ | ✔ | | [JSONCompactStringsEachRowWithNamesAndTypes](#jsoncompactstringseachrowwithnamesandtypes) | ✔ | ✔ | -| [JSONObjectEachRow](#jsonobjecteachrow) | ✔ | ✔ | -| [BSONEachRow](#bsoneachrow) | ✔ | ✔ | -| [TSKV](#tskv) | ✔ | ✔ | -| [Pretty](#pretty) | ✗ | ✔ | -| [PrettyNoEscapes](#prettynoescapes) | ✗ | ✔ | -| [PrettyMonoBlock](#prettymonoblock) | ✗ | ✔ | -| [PrettyNoEscapesMonoBlock](#prettynoescapesmonoblock) | ✗ | ✔ | -| [PrettyCompact](#prettycompact) | ✗ | ✔ | -| [PrettyCompactNoEscapes](#prettycompactnoescapes) | ✗ | ✔ | -| [PrettyCompactMonoBlock](#prettycompactmonoblock) | ✗ | ✔ | -| [PrettyCompactNoEscapesMonoBlock](#prettycompactnoescapesmonoblock) | ✗ | ✔ | -| [PrettySpace](#prettyspace) | ✗ | ✔ | -| [PrettySpaceNoEscapes](#prettyspacenoescapes) | ✗ | ✔ | -| [PrettySpaceMonoBlock](#prettyspacemonoblock) | ✗ | ✔ | -| [PrettySpaceNoEscapesMonoBlock](#prettyspacenoescapesmonoblock) | ✗ | ✔ | -| [Prometheus](#prometheus) | ✗ | ✔ | -| [Protobuf](#protobuf) | ✔ | ✔ | -| [ProtobufSingle](#protobufsingle) | ✔ | ✔ | -| [Avro](#data-format-avro) | ✔ | ✔ | -| [AvroConfluent](#data-format-avro-confluent) | ✔ | ✗ | -| [Parquet](#data-format-parquet) | ✔ | ✔ | -| [Arrow](#data-format-arrow) | ✔ | ✔ | -| [ArrowStream](#data-format-arrow-stream) | ✔ | ✔ | -| [ORC](#data-format-orc) | ✔ | ✔ | -| [RowBinary](#rowbinary) | ✔ | ✔ | -| [RowBinaryWithNames](#rowbinarywithnamesandtypes) | ✔ | ✔ | -| [RowBinaryWithNamesAndTypes](#rowbinarywithnamesandtypes) | ✔ | ✔ | -| [Native](#native) | ✔ | ✔ | -| [Null](#null) | ✗ | ✔ | -| [XML](#xml) | ✗ | ✔ | -| [CapnProto](#capnproto) | ✔ | ✔ | -| [LineAsString](#lineasstring) | ✔ | ✔ | -| [Regexp](#data-format-regexp) | ✔ | ✗ | -| [RawBLOB](#rawblob) | ✔ | ✔ | -| [MsgPack](#msgpack) | ✔ | ✔ | -| 
[MySQLDump](#mysqldump) | ✔ | ✗ | -| [Markdown](#markdown) | ✗ | ✔ | +| [JSONObjectEachRow](#jsonobjecteachrow) | ✔ | ✔ | +| [BSONEachRow](#bsoneachrow) | ✔ | ✔ | +| [TSKV](#tskv) | ✔ | ✔ | +| [Pretty](#pretty) | ✗ | ✔ | +| [PrettyNoEscapes](#prettynoescapes) | ✗ | ✔ | +| [PrettyMonoBlock](#prettymonoblock) | ✗ | ✔ | +| [PrettyNoEscapesMonoBlock](#prettynoescapesmonoblock) | ✗ | ✔ | +| [PrettyCompact](#prettycompact) | ✗ | ✔ | +| [PrettyCompactNoEscapes](#prettycompactnoescapes) | ✗ | ✔ | +| [PrettyCompactMonoBlock](#prettycompactmonoblock) | ✗ | ✔ | +| [PrettyCompactNoEscapesMonoBlock](#prettycompactnoescapesmonoblock) | ✗ | ✔ | +| [PrettySpace](#prettyspace) | ✗ | ✔ | +| [PrettySpaceNoEscapes](#prettyspacenoescapes) | ✗ | ✔ | +| [PrettySpaceMonoBlock](#prettyspacemonoblock) | ✗ | ✔ | +| [PrettySpaceNoEscapesMonoBlock](#prettyspacenoescapesmonoblock) | ✗ | ✔ | +| [Prometheus](#prometheus) | ✗ | ✔ | +| [Protobuf](#protobuf) | ✔ | ✔ | +| [ProtobufSingle](#protobufsingle) | ✔ | ✔ | +| [Avro](#data-format-avro) | ✔ | ✔ | +| [AvroConfluent](#data-format-avro-confluent) | ✔ | ✗ | +| [Parquet](#data-format-parquet) | ✔ | ✔ | +| [ParquetMetadata](#data-format-parquet-metadata) | ✔ | ✗ | +| [Arrow](#data-format-arrow) | ✔ | ✔ | +| [ArrowStream](#data-format-arrow-stream) | ✔ | ✔ | +| [ORC](#data-format-orc) | ✔ | ✔ | +| [RowBinary](#rowbinary) | ✔ | ✔ | +| [RowBinaryWithNames](#rowbinarywithnamesandtypes) | ✔ | ✔ | +| [RowBinaryWithNamesAndTypes](#rowbinarywithnamesandtypes) | ✔ | ✔ | +| [Native](#native) | ✔ | ✔ | +| [Null](#null) | ✗ | ✔ | +| [XML](#xml) | ✗ | ✔ | +| [CapnProto](#capnproto) | ✔ | ✔ | +| [LineAsString](#lineasstring) | ✔ | ✔ | +| [Regexp](#data-format-regexp) | ✔ | ✗ | +| [RawBLOB](#rawblob) | ✔ | ✔ | +| [MsgPack](#msgpack) | ✔ | ✔ | +| [MySQLDump](#mysqldump) | ✔ | ✗ | +| [Markdown](#markdown) | ✗ | ✔ | You can control some format processing parameters with the ClickHouse settings. 
For more information read the [Settings](/docs/en/operations/settings/settings-formats.md) section. @@ -915,8 +917,6 @@ Example: {"num":44,"str":"hello","arr":[0,1,2,3]} ``` -While importing data columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1. - ## JSONStringsEachRow {#jsonstringseachrow} Differs from JSONEachRow only in that data fields are output in strings, not in typed JSON values. @@ -2003,6 +2003,128 @@ To exchange data with Hadoop, you can use [HDFS table engine](/docs/en/engines/t - [output_format_parquet_version](/docs/en/operations/settings/settings-formats.md/#output_format_parquet_version) - The version of Parquet format used in output format. Default value - `2.latest`. - [output_format_parquet_compression_method](/docs/en/operations/settings/settings-formats.md/#output_format_parquet_compression_method) - compression method used in output Parquet format. Default value - `snappy`. +## ParquetMetadata {#data-format-parquet-metadata} + +Special format for reading Parquet file metadata (https://parquet.apache.org/docs/file-format/metadata/). 
It always outputs one row with the next structure/content: +- num_columns - the number of columns +- num_rows - the total number of rows +- num_row_groups - the total number of row groups +- format_version - parquet format version, always 1.0 or 2.6 +- total_byte_size - total bytes size of the data, calculated as the sum of total_byte_size from all row groups +- total_compressed_size - total compressed bytes size of the data, calculated as the sum of total_compressed_size from all row groups +- columns - the list of columns metadata with the next structure: + - name - column name + - path - column path (differs from name for nested column) + - max_definition_level - maximum definition level + - max_repetition_level - maximum repetition level + - physical_type - column physical type + - logical_type - column logical type + - compression - compression used for this column + - encodings - the list of encodings used for this column +- row_groups - the list of row groups metadata with the next structure: + - num_columns - the number of columns in the row group + - num_rows - the number of rows in the row group + - total_byte_size - total bytes size of the row group + - total_compressed_size - total compressed bytes size of the row group + - columns - the list of column chunks metadata with the next structure: + - name - column name + - path - column path + - total_compressed_size - total compressed bytes size of the column + - total_uncompressed_size - total uncompressed bytes size of the column + - have_statistics - bool flag that indicates if column chunk metadata contains column statistics + - statistics - column chunk statistics (all fields are NULL if have_statistics = false) with the next structure: + - num_values - the number of non-null values in the column chunk + - null_count - the number of NULL values in the column chunk + - distinct_count - the number of distinct values in the column chunk + - min - the minimum value of the column chunk + - max - the 
maximum value of the column chunk + +Example: + +```sql +SELECT * FROM file(data.parquet, ParquetMetadata) format PrettyJSONEachRow +``` + +```json +{ + "num_columns": "2", + "num_rows": "1000000", + "num_row_groups": "16", + "format_version": "2.6", + "total_byte_size": "10001981", + "total_compressed_size": "6011415", + "columns": [ + { + "name": "number", + "path": "number", + "max_definition_level": "0", + "max_repetition_level": "0", + "physical_type": "INT64", + "logical_type": "Int(bitWidth=64, isSigned=false)", + "compression": "LZ4", + "encodings": [ + "RLE_DICTIONARY", + "PLAIN", + "RLE" + ] + }, + { + "name": "'Hello'", + "path": "'Hello'", + "max_definition_level": "0", + "max_repetition_level": "0", + "physical_type": "BYTE_ARRAY", + "logical_type": "None", + "compression": "LZ4", + "encodings": [ + "RLE_DICTIONARY", + "PLAIN", + "RLE" + ] + } + ], + "row_groups": [ + { + "num_columns": "2", + "num_rows": "65409", + "total_byte_size": "654367", + "total_compressed_size": "393396", + "columns": [ + { + "name": "number", + "path": "number", + "total_compressed_size": "393329", + "total_uncompressed_size": "654302", + "have_statistics": true, + "statistics": { + "num_values": "65409", + "null_count": "0", + "distinct_count": null, + "min": "0", + "max": "65408" + } + }, + { + "name": "'Hello'", + "path": "'Hello'", + "total_compressed_size": "67", + "total_uncompressed_size": "65", + "have_statistics": true, + "statistics": { + "num_values": "65409", + "null_count": "0", + "distinct_count": null, + "min": "Hello", + "max": "Hello" + } + } + ] + }, + ... + ] +} +``` + ## Arrow {#data-format-arrow} [Apache Arrow](https://arrow.apache.org/) comes with two built-in columnar storage formats. ClickHouse supports read and write operations for these formats. 
diff --git a/src/Formats/registerFormats.cpp b/src/Formats/registerFormats.cpp index 285e234167b..29ef46f330f 100644 --- a/src/Formats/registerFormats.cpp +++ b/src/Formats/registerFormats.cpp @@ -100,6 +100,7 @@ void registerInputFormatJSONAsString(FormatFactory & factory); void registerInputFormatJSONAsObject(FormatFactory & factory); void registerInputFormatLineAsString(FormatFactory & factory); void registerInputFormatMySQLDump(FormatFactory & factory); +void registerInputFormatParquetMetadata(FormatFactory & factory); #if USE_HIVE void registerInputFormatHiveText(FormatFactory & factory); @@ -140,6 +141,7 @@ void registerValuesSchemaReader(FormatFactory & factory); void registerTemplateSchemaReader(FormatFactory & factory); void registerMySQLSchemaReader(FormatFactory & factory); void registerBSONEachRowSchemaReader(FormatFactory & factory); +void registerParquetMetadataSchemaReader(FormatFactory & factory); void registerFileExtensions(FormatFactory & factory); @@ -240,6 +242,8 @@ void registerFormats() registerInputFormatCapnProto(factory); registerInputFormatMySQLDump(factory); + registerInputFormatParquetMetadata(factory); + registerNonTrivialPrefixAndSuffixCheckerJSONEachRow(factory); registerNonTrivialPrefixAndSuffixCheckerJSONAsString(factory); registerNonTrivialPrefixAndSuffixCheckerJSONAsObject(factory); @@ -274,6 +278,7 @@ void registerFormats() registerTemplateSchemaReader(factory); registerMySQLSchemaReader(factory); registerBSONEachRowSchemaReader(factory); + registerParquetMetadataSchemaReader(factory); } } diff --git a/src/Processors/Formats/Impl/ParquetMetadataInputFormat.cpp b/src/Processors/Formats/Impl/ParquetMetadataInputFormat.cpp new file mode 100644 index 00000000000..c384c3811db --- /dev/null +++ b/src/Processors/Formats/Impl/ParquetMetadataInputFormat.cpp @@ -0,0 +1,499 @@ +#include "ParquetMetadataInputFormat.h" + +#if USE_PARQUET + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include 
+#include +#include +#include +#include +#include +#include +#include +#include +#include "ArrowBufferedStreams.h" +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; +} + +static NamesAndTypesList getHeaderForParquetMetadata() +{ + NamesAndTypesList names_and_types{ + {"num_columns", std::make_shared()}, + {"num_rows", std::make_shared()}, + {"num_row_groups", std::make_shared()}, + {"format_version", std::make_shared()}, + {"total_byte_size", std::make_shared()}, + {"total_compressed_size", std::make_shared()}, + {"columns", + std::make_shared( + std::make_shared( + DataTypes{ + std::make_shared(), + std::make_shared(), + std::make_shared(), + std::make_shared(), + std::make_shared(), + std::make_shared(), + std::make_shared(), + std::make_shared(std::make_shared())}, + Names{ + "name", + "path", + "max_definition_level", + "max_repetition_level", + "physical_type", + "logical_type", + "compression", + "encodings"}))}, + {"row_groups", + std::make_shared(std::make_shared( + DataTypes{ + std::make_shared(), + std::make_shared(), + std::make_shared(), + std::make_shared(), + std::make_shared( + std::make_shared( + DataTypes{ + std::make_shared(), + std::make_shared(), + std::make_shared(), + std::make_shared(), + DataTypeFactory::instance().get("Bool"), + std::make_shared( + DataTypes{ + std::make_shared(std::make_shared()), + std::make_shared(std::make_shared()), + std::make_shared(std::make_shared()), + std::make_shared(std::make_shared()), + std::make_shared(std::make_shared())}, + Names{"num_values", "null_count", "distinct_count", "min", "max"}), + }, + Names{"name", "path", "total_compressed_size", "total_uncompressed_size", "have_statistics", "statistics"}))}, + Names{"num_columns", "num_rows", "total_byte_size", "total_compressed_size", "columns"}))}, + }; + return names_and_types; +} + +void checkHeader(const Block & header) +{ + auto expected_names_and_types = getHeaderForParquetMetadata(); + std::unordered_map 
name_to_type; + for (const auto & [name, type] : expected_names_and_types) + name_to_type[name] = type; + + for (const auto & [name, type] : header.getNamesAndTypes()) + { + auto it = name_to_type.find(name); + if (it == name_to_type.end()) + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Unexpected column: {}. ParquetMetadata format allows only the next columns: num_columns, num_rows, num_row_groups, " + "format_version, total_byte_size, total_compressed_size, columns, row_groups", name); + + if (!it->second->equals(*type)) + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Unexpected type {} for column {}. Expected type: {}", + type->getName(), + name, + it->second->getName()); + } +} + +static std::shared_ptr getFileMetadata( + ReadBuffer & in, + const FormatSettings & format_settings, + std::atomic & is_stopped) +{ + auto arrow_file = asArrowFile(in, format_settings, is_stopped, "Parquet", PARQUET_MAGIC_BYTES); + return parquet::ReadMetaData(arrow_file); +} + +ParquetMetadataInputFormat::ParquetMetadataInputFormat(ReadBuffer & in_, Block header_, const FormatSettings & format_settings_) + : IInputFormat(std::move(header_), in_), format_settings(format_settings_) +{ + checkHeader(getPort().getHeader()); +} + +Chunk ParquetMetadataInputFormat::generate() +{ + Chunk res; + if (done) + return res; + + auto metadata = getFileMetadata(*in, format_settings, is_stopped); + + const auto & header = getPort().getHeader(); + auto names_and_types = getHeaderForParquetMetadata(); + auto names = names_and_types.getNames(); + auto types = names_and_types.getTypes(); + + for (const auto & name : header.getNames()) + { + /// num_columns + if (name == names[0]) + { + auto column = types[0]->createColumn(); + assert_cast(*column).insertValue(metadata->num_columns()); + res.addColumn(std::move(column)); + } + /// num_rows + else if (name == names[1]) + { + auto column = types[1]->createColumn(); + assert_cast(*column).insertValue(metadata->num_rows()); + res.addColumn(std::move(column)); + } + /// num_row_groups + else 
if (name == names[2]) + { + auto column = types[2]->createColumn(); + assert_cast(*column).insertValue(metadata->num_row_groups()); + res.addColumn(std::move(column)); + } + /// format_version + else if (name == names[3]) + { + auto column = types[3]->createColumn(); + String version = metadata->version() == parquet::ParquetVersion::PARQUET_1_0 ? "1.0" : "2.6"; + assert_cast(*column).insertData(version.data(), version.size()); + res.addColumn(std::move(column)); + } + /// total_byte_size + else if (name == names[4]) + { + auto column = types[4]->createColumn(); + size_t total_byte_size = 0; + for (int32_t i = 0; i != metadata->num_row_groups(); ++i) + total_byte_size += metadata->RowGroup(i)->total_byte_size(); + + assert_cast(*column).insertValue(total_byte_size); + res.addColumn(std::move(column)); + } + /// total_compressed_size + else if (name == names[5]) + { + auto column = types[5]->createColumn(); + size_t total_compressed_size = 0; + for (int32_t i = 0; i != metadata->num_row_groups(); ++i) + total_compressed_size += metadata->RowGroup(i)->total_compressed_size(); + + assert_cast(*column).insertValue(total_compressed_size); + res.addColumn(std::move(column)); + } + /// columns + else if (name == names[6]) + { + auto column = types[6]->createColumn(); + fillColumnsMetadata(metadata, column); + res.addColumn(std::move(column)); + } + /// row_groups + else if (name == names[7]) + { + auto column = types[7]->createColumn(); + fillRowGroupsMetadata(metadata, column); + res.addColumn(std::move(column)); + } + } + + done = true; + return res; +} + +void ParquetMetadataInputFormat::fillColumnsMetadata(const std::shared_ptr & metadata, MutableColumnPtr & column) +{ + auto & array_column = assert_cast(*column); + auto & tuple_column = assert_cast(array_column.getData()); + int32_t num_columns = metadata->num_columns(); + for (int32_t i = 0; i != num_columns; ++i) + { + const auto * column_info = metadata->schema()->Column(i); + /// name + String column_name = 
column_info->name(); + assert_cast(tuple_column.getColumn(0)).insertData(column_name.data(), column_name.size()); + /// path + String path = column_info->path()->ToDotString(); + assert_cast(tuple_column.getColumn(1)).insertData(path.data(), path.size()); + /// max_definition_level + assert_cast(tuple_column.getColumn(2)).insertValue(column_info->max_definition_level()); + /// max_repetition_level + assert_cast(tuple_column.getColumn(3)).insertValue(column_info->max_repetition_level()); + /// physical_type + std::string_view physical_type = magic_enum::enum_name(column_info->physical_type()); + assert_cast(tuple_column.getColumn(4)).insertData(physical_type.data(), physical_type.size()); + /// logical_type + String logical_type = column_info->logical_type()->ToString(); + assert_cast(tuple_column.getColumn(5)).insertData(logical_type.data(), logical_type.size()); + + if (metadata->num_row_groups() > 0) + { + auto column_chunk_metadata = metadata->RowGroup(0)->ColumnChunk(i); + std::string_view compression = magic_enum::enum_name(column_chunk_metadata->compression()); + assert_cast(tuple_column.getColumn(6)).insertData(compression.data(), compression.size()); + auto & encodings_array_column = assert_cast(tuple_column.getColumn(7)); + auto & encodings_nested_column = assert_cast(encodings_array_column.getData()); + for (auto codec : column_chunk_metadata->encodings()) + { + auto codec_name = magic_enum::enum_name(codec); + encodings_nested_column.insertData(codec_name.data(), codec_name.size()); + } + encodings_array_column.getOffsets().push_back(encodings_nested_column.size()); + } + else /// No row groups: fill compression (element 6) and encodings (element 7) with placeholders so all tuple elements get exactly one value per column. + { + String compression = "NONE"; + assert_cast(tuple_column.getColumn(6)).insertData(compression.data(), compression.size()); + tuple_column.getColumn(7).insertDefault(); + } + } + array_column.getOffsets().push_back(tuple_column.size()); +} + +void ParquetMetadataInputFormat::fillRowGroupsMetadata(const std::shared_ptr & metadata, MutableColumnPtr & column) +{ + auto & 
row_groups_array_column = assert_cast(*column); + auto & row_groups_column = assert_cast(row_groups_array_column.getData()); + for (int32_t i = 0; i != metadata->num_row_groups(); ++i) + { + auto row_group_metadata = metadata->RowGroup(i); + /// num_columns + assert_cast(row_groups_column.getColumn(0)).insertValue(row_group_metadata->num_columns()); + /// num_rows + assert_cast(row_groups_column.getColumn(1)).insertValue(row_group_metadata->num_rows()); + /// total_bytes_size + assert_cast(row_groups_column.getColumn(2)).insertValue(row_group_metadata->total_byte_size()); + /// total_compressed_size + assert_cast(row_groups_column.getColumn(3)).insertValue(row_group_metadata->total_compressed_size()); + /// columns + fillColumnChunksMetadata(row_group_metadata, row_groups_column.getColumn(4)); + } + row_groups_array_column.getOffsets().push_back(row_groups_column.size()); +} + +void ParquetMetadataInputFormat::fillColumnChunksMetadata(const std::unique_ptr & row_group_metadata, IColumn & column) +{ + auto & array_column = assert_cast(column); + auto & tuple_column = assert_cast(array_column.getData()); + for (int32_t column_i = 0; column_i != row_group_metadata->num_columns(); ++column_i) + { + auto column_chunk_metadata = row_group_metadata->ColumnChunk(column_i); + /// name + String column_name = row_group_metadata->schema()->Column(column_i)->name(); + assert_cast(tuple_column.getColumn(0)).insertData(column_name.data(), column_name.size()); + /// path + String path = row_group_metadata->schema()->Column(column_i)->path()->ToDotString(); + assert_cast(tuple_column.getColumn(1)).insertData(path.data(), path.size()); + /// total_compressed_size + assert_cast(tuple_column.getColumn(2)).insertValue(column_chunk_metadata->total_compressed_size()); + /// total_uncompressed_size + assert_cast(tuple_column.getColumn(3)).insertValue(column_chunk_metadata->total_uncompressed_size()); + /// have_statistics + bool have_statistics = column_chunk_metadata->is_stats_set(); + 
assert_cast(tuple_column.getColumn(4)).insertValue(have_statistics); + if (have_statistics) + fillColumnStatistics(column_chunk_metadata->statistics(), tuple_column.getColumn(5), row_group_metadata->schema()->Column(column_i)->type_length()); + else + tuple_column.getColumn(5).insertDefault(); + } + array_column.getOffsets().push_back(tuple_column.size()); +} + +template +static void getMinMaxNumberStatistics(const std::shared_ptr & statistics, String & min, String & max) +{ + const auto & typed_statistics = dynamic_cast &>(*statistics); + min = std::to_string(typed_statistics.min()); + max = std::to_string(typed_statistics.max()); +} + +void ParquetMetadataInputFormat::fillColumnStatistics(const std::shared_ptr & statistics, IColumn & column, int32_t type_length) +{ + auto & statistics_column = assert_cast(column); + /// num_values + auto & nullable_num_values = assert_cast(statistics_column.getColumn(0)); + assert_cast(nullable_num_values.getNestedColumn()).insertValue(statistics->num_values()); + nullable_num_values.getNullMapData().push_back(0); + + /// null_count + if (statistics->HasNullCount()) + { + auto & nullable_null_count = assert_cast(statistics_column.getColumn(1)); + assert_cast(nullable_null_count.getNestedColumn()).insertValue(statistics->null_count()); + nullable_null_count.getNullMapData().push_back(0); + } + else + { + statistics_column.getColumn(1).insertDefault(); + } + + /// distinct_count + if (statistics->HasDistinctCount()) + { + auto & nullable_distinct_count = assert_cast(statistics_column.getColumn(2)); + size_t distinct_count = statistics->distinct_count(); + /// It can be set but still be 0 because of a bug: https://github.com/apache/arrow/issues/27644 + /// If we see distinct_count = 0 with non 0 values in chunk, set it to NULL. 
+ if (distinct_count == 0 && statistics->num_values() != 0) + { + nullable_distinct_count.insertDefault(); + } + else + { + assert_cast(nullable_distinct_count.getNestedColumn()).insertValue(distinct_count); + nullable_distinct_count.getNullMapData().push_back(0); + } + } + else + { + statistics_column.getColumn(2).insertDefault(); + } + + /// min/max + if (statistics->HasMinMax() && statistics->physical_type() != parquet::Type::type::UNDEFINED) + { + String min; + String max; + switch (statistics->physical_type()) + { + case parquet::Type::type::FLOAT: + { + getMinMaxNumberStatistics(statistics, min, max); + break; + } + case parquet::Type::type::DOUBLE: + { + getMinMaxNumberStatistics(statistics, min, max); + break; + } + case parquet::Type::type::INT32: + { + getMinMaxNumberStatistics(statistics, min, max); + break; + } + case parquet::Type::type::INT64: + { + getMinMaxNumberStatistics(statistics, min, max); + break; + } + case parquet::Type::type::INT96: + { + const auto & int96_statistics = dynamic_cast &>(*statistics); + min = parquet::Int96ToString(int96_statistics.min()); + max = parquet::Int96ToString(int96_statistics.max()); + break; + } + case parquet::Type::type::BOOLEAN: + { + getMinMaxNumberStatistics(statistics, min, max); + break; + } + case parquet::Type::type::BYTE_ARRAY: + { + const auto & byte_array_statistics = dynamic_cast(*statistics); + min = parquet::ByteArrayToString(byte_array_statistics.min()); + max = parquet::ByteArrayToString(byte_array_statistics.max()); + break; + } + case parquet::Type::type::FIXED_LEN_BYTE_ARRAY: + { + const auto & flba_statistics = dynamic_cast(*statistics); + min = parquet::FixedLenByteArrayToString(flba_statistics.min(), type_length); + max = parquet::FixedLenByteArrayToString(flba_statistics.max(), type_length); + break; + } + case parquet::Type::type::UNDEFINED: + { + break; /// unreachable + } + } + + auto & nullable_min = assert_cast(statistics_column.getColumn(3)); + 
assert_cast(nullable_min.getNestedColumn()).insertData(min.data(), min.size()); + nullable_min.getNullMapData().push_back(0); + auto & nullable_max = assert_cast(statistics_column.getColumn(4)); + assert_cast(nullable_max.getNestedColumn()).insertData(max.data(), max.size()); + nullable_max.getNullMapData().push_back(0); + } + else + { + statistics_column.getColumn(3).insertDefault(); + statistics_column.getColumn(4).insertDefault(); + } +} + +void ParquetMetadataInputFormat::resetParser() +{ + IInputFormat::resetParser(); + done = false; +} + +ParquetMetadataSchemaReader::ParquetMetadataSchemaReader(ReadBuffer & in_) + : ISchemaReader(in_) +{ +} + +NamesAndTypesList ParquetMetadataSchemaReader::readSchema() +{ + return getHeaderForParquetMetadata(); +} + +void registerInputFormatParquetMetadata(FormatFactory & factory) +{ + factory.registerInputFormat( + "ParquetMetadata", + [](ReadBuffer &buf, + const Block &sample, + const RowInputFormatParams &, + const FormatSettings & settings) + { + return std::make_shared(buf, sample, settings); + }); + factory.markFormatSupportsSubcolumns("ParquetMetadata"); + factory.markFormatSupportsSubsetOfColumns("ParquetMetadata"); +} + +void registerParquetMetadataSchemaReader(FormatFactory & factory) +{ + factory.registerSchemaReader( + "ParquetMetadata", + [](ReadBuffer & buf, const FormatSettings &) + { + return std::make_shared(buf); + } + ); +} + +} + +#else + +namespace DB +{ +class FormatFactory; +void registerInputFormatParquetMetadata(FormatFactory &) +{ +} + +void registerParquetMetadataSchemaReader(FormatFactory &) {} +} + +#endif diff --git a/src/Processors/Formats/Impl/ParquetMetadataInputFormat.h b/src/Processors/Formats/Impl/ParquetMetadataInputFormat.h new file mode 100644 index 00000000000..3561ec6dae8 --- /dev/null +++ b/src/Processors/Formats/Impl/ParquetMetadataInputFormat.h @@ -0,0 +1,90 @@ +#pragma once +#include "config.h" +#if USE_PARQUET + +#include +#include +#include +#include + +namespace parquet::arrow { 
class FileReader; } + +namespace arrow { class Buffer; class RecordBatchReader;} + +namespace DB +{ + +/* Special format that always returns just one row with Parquet file metadata (see https://parquet.apache.org/docs/file-format/metadata/). + * The result row has the following structure: + * num_columns - the number of columns + * num_rows - the total number of rows + * num_row_groups - the total number of row groups + * format_version - parquet format version, always 1.0 or 2.6 + * total_byte_size - total bytes size of the data, calculated as the sum of total_byte_size from all row groups + * total_compressed_size - total compressed bytes size of the data, calculated as the sum of total_compressed_size from all row groups + * columns - the list of columns metadata with the following structure: + * name - column name + * path - column path (differs from name for nested column) + * max_definition_level - maximum definition level + * max_repetition_level - maximum repetition level + * physical_type - column physical type + * logical_type - column logical type + * compression - compression used for this column + * encodings - the list of encodings used for this column + * row_groups - the list of row groups metadata with the following structure: + * num_columns - the number of columns in the row group + * num_rows - the number of rows in the row group + * total_byte_size - total bytes size of the row group + * total_compressed_size - total compressed bytes size of the row group + * columns - the list of column chunks metadata with the following structure: + * name - column name + * path - column path + * total_compressed_size - total compressed bytes size of the column + * total_uncompressed_size - total uncompressed bytes size of the column + * have_statistics - bool flag that indicates if column chunk metadata contains column statistics + * statistics - column chunk statistics (all fields are NULL if have_statistics = false) with the following structure: + * num_values - the number of 
non-null values in the column chunk + * null_count - the number of NULL values in the column chunk + * distinct_count - the number of distinct values in the column chunk + * min - the minimum value of the column chunk + * max - the maximum value of the column chunk + * */ + +class ParquetMetadataInputFormat : public IInputFormat +{ +public: + ParquetMetadataInputFormat(ReadBuffer & in_, Block header_, const FormatSettings & format_settings_); + + String getName() const override { return "ParquetMetadataInputFormat"; } + + void resetParser() override; + +private: + Chunk generate() override; + + void onCancel() override + { + is_stopped = 1; + } + + void fillColumnsMetadata(const std::shared_ptr & metadata, MutableColumnPtr & column); + void fillRowGroupsMetadata(const std::shared_ptr & metadata, MutableColumnPtr & column); + void fillColumnChunksMetadata(const std::unique_ptr & row_group_metadata, IColumn & column); + void fillColumnStatistics(const std::shared_ptr & statistics, IColumn & column, int32_t type_length); + + const FormatSettings format_settings; + bool done = false; + std::atomic is_stopped{0}; +}; + +class ParquetMetadataSchemaReader : public ISchemaReader +{ +public: + ParquetMetadataSchemaReader(ReadBuffer & in_); + + NamesAndTypesList readSchema() override; +}; + +} + +#endif diff --git a/tests/queries/0_stateless/02718_parquet_metadata_format.reference b/tests/queries/0_stateless/02718_parquet_metadata_format.reference new file mode 100644 index 00000000000..5ec8b097cea --- /dev/null +++ b/tests/queries/0_stateless/02718_parquet_metadata_format.reference @@ -0,0 +1,154 @@ +{ + "num_columns": "3", + "num_rows": "100000", + "num_row_groups": "2", + "format_version": "2.6", + "total_byte_size": "314147", + "total_compressed_size": "27081", + "columns": [ + { + "name": "number", + "path": "number", + "max_definition_level": "0", + "max_repetition_level": "0", + "physical_type": "INT32", + "logical_type": "Int(bitWidth=16, isSigned=false)", + 
"compression": "LZ4", + "encodings": [ + "RLE_DICTIONARY", + "PLAIN", + "RLE" + ] + }, + { + "name": "str", + "path": "str", + "max_definition_level": "0", + "max_repetition_level": "0", + "physical_type": "BYTE_ARRAY", + "logical_type": "None", + "compression": "LZ4", + "encodings": [ + "RLE_DICTIONARY", + "PLAIN", + "RLE" + ] + }, + { + "name": "mod", + "path": "mod", + "max_definition_level": "1", + "max_repetition_level": "0", + "physical_type": "INT32", + "logical_type": "Int(bitWidth=8, isSigned=false)", + "compression": "LZ4", + "encodings": [ + "RLE_DICTIONARY", + "PLAIN", + "RLE" + ] + } + ], + "row_groups": [ + { + "num_columns": "3", + "num_rows": "65409", + "total_byte_size": "200527", + "total_compressed_size": "14406", + "columns": [ + { + "name": "number", + "path": "number", + "total_compressed_size": "7070", + "total_uncompressed_size": "85956", + "have_statistics": true, + "statistics": { + "num_values": "65409", + "null_count": "0", + "distinct_count": null, + "min": "0", + "max": "999" + } + }, + { + "name": "str", + "path": "str", + "total_compressed_size": "7093", + "total_uncompressed_size": "93853", + "have_statistics": true, + "statistics": { + "num_values": "65409", + "null_count": "0", + "distinct_count": null, + "min": "Hello0", + "max": "Hello999" + } + }, + { + "name": "mod", + "path": "mod", + "total_compressed_size": "243", + "total_uncompressed_size": "20718", + "have_statistics": true, + "statistics": { + "num_values": "32705", + "null_count": "32704", + "distinct_count": null, + "min": "0", + "max": "8" + } + } + ] + }, + { + "num_columns": "3", + "num_rows": "34591", + "total_byte_size": "113620", + "total_compressed_size": "12675", + "columns": [ + { + "name": "number", + "path": "number", + "total_compressed_size": "6223", + "total_uncompressed_size": "47365", + "have_statistics": true, + "statistics": { + "num_values": "34591", + "null_count": "0", + "distinct_count": null, + "min": "0", + "max": "999" + } + }, + { + "name": 
"str", + "path": "str", + "total_compressed_size": "6247", + "total_uncompressed_size": "55262", + "have_statistics": true, + "statistics": { + "num_values": "34591", + "null_count": "0", + "distinct_count": null, + "min": "Hello0", + "max": "Hello999" + } + }, + { + "name": "mod", + "path": "mod", + "total_compressed_size": "205", + "total_uncompressed_size": "10993", + "have_statistics": true, + "statistics": { + "num_values": "17295", + "null_count": "17296", + "distinct_count": null, + "min": "0", + "max": "8" + } + } + ] + } + ] +} diff --git a/tests/queries/0_stateless/02718_parquet_metadata_format.sh b/tests/queries/0_stateless/02718_parquet_metadata_format.sh new file mode 100755 index 00000000000..f785abde368 --- /dev/null +++ b/tests/queries/0_stateless/02718_parquet_metadata_format.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +$CLICKHOUSE_LOCAL -q "select * from file('$CURDIR/data_parquet/02718_data.parquet', ParquetMetadata) format JSONEachRow" | python3 -m json.tool diff --git a/tests/queries/0_stateless/data_parquet/02718_data.parquet b/tests/queries/0_stateless/data_parquet/02718_data.parquet new file mode 100644 index 0000000000000000000000000000000000000000..6a930689c83eefbb100f6210a3bab04c7094bfd7 GIT binary patch literal 28165 zcmeI*30%$j{>Smt;uNCLNr*z7LejpFJ%kXl%rG;d2q{a}F*1b4V2pieOh`g#X0nbY zq_QO>A*5YG$iAHa>vv{c^PQP{r~iGp_s%>X=RD?_^ZC8M51oF$&d;gO?{mhtV?PIl zt|FTM7096neN;s?REGg-pe77qgj(S5lhuJS z6fi+un4%u)qXEp&5RG6C3p7R(G=(LspoBGS&C&viMD8m_UM3)=!DMjLKk#JH+Z8vdY~tK&1Ho z@ew}8C-BEG495rrAP^%l3PBi+F&K+rj6(=QF&+~z5t9&x$(Vwvn1<u7Gnv%!cu&VWmt|CScz3wjWzfNYq1Vd_!jH&9ip)T z8?gy7*o;_gK^(SX8@3}JJFpYGkbv*;1Aaszc4H6rVjuS70Di(j9KvB7K@yJQ7>*+u zCy;_vq~Rn^;WWWxftD8M0O&v$RUn5R z^idVnP#p%SftoOc5o)0}>cAKZn4m69Q4jUe0A^^2Mlgp38lwrC!V*?c!WuSchURDi zTeO57TEQL;aD)?_;R08;kDAdJQsj72cUAq1fqj|rHF zNeIJaOu@aXDZa)sEXNA0 
z#44=D8hnGbScfQli}m;p(b#~E*n}8tMl7}<4qLGe+Yyf)*oj?8!1wq8KOzyku?Ksx z5BqTdKj9z_;V_OM2}f}Z$B~Q^NI@#na1y6*8fS18={ScBoW})RL?$jF3zw0NE4Yel z$ia2oz)j@h7V?mf0u-VM#VEmTl;RG`a2NM*ALV#}hj@evJjN3|#WOs|3%tZDyv7@- zApPU71#Rd+7gZpK9`sQa)leMT@(Ew&>h(<7n1sbCX zn!*xRP{JBEXolu!0b8_$9a_O24se7MoZ$jjxWOGBXpJ`TL|e2&dvriYbV6r%p$odA z8@$mSJT^ z7>q?Q#vufu7>@~;WK6+SOv7}{z^9mr&oB$&n2pad2XhgDd6VJW`GGAzdmti&p;#u|KswOEHJe2ew?4$;_vjo5@3Y(^}$AP!rx4cifq9oUIo zNWk~_0Y4%UyRip*u@C!k06*a%4&gA4APGlt49Ag-6G%ZS(r^-|a2jWD7U?*L44lUW zTtp@=Aq$t0jVri{YskTM+`vub;ui9dj{+2;2*oJDZIt2;%5WF=a3AG(fQNX53OvRW zJjF9S#|yl~E4;=Vs2*1@=imROS~@)cp#xo1fgF0!M^#iqbr_%qYQhjksD;|717j#) zg1Rt8J=8}7n4uvW!5kK7j3#IbOISe(YuKO}nxh45(GqrO1$#Ka5l(Q13tZs_cX*&R z+Q1WS(GKm=0Ugl^o#BNp=!$OeMtAf;Pxzo0dZQ0~(HH&DAAT5sff$6r_y8Yb2!`S# ze2h=vk6{>&5ePsaMq(6#FdAbp7Qq;Y5QJhpCSW2aAq0+z z1*u5GNu0uIoWWV7;~X+@9v5&CnYe^3Tt+so;3}>m2iI`}H<625$U{B~P>3QFLm^XK zRaI5Iva?mpd84N=Qpgo@Ggq07t`wT;@(;Lldm_D#zgjI^rfZQT(>3-_($cq#PSQ4T z&P~!W^0GLpYwYKLw2Em!^ijE4Sng3hiwKKj`j#vFk5#pai9S}%HZk{Db^BzC;|9)| z{>N*$7epVg=~E=$Z$Z&RM!^qgl!=7@7`pA~F6 z*X~tC{<-$DDwY`?bQJ*^9rdkZGCCQ!6l8QZ>SB4`%XmP*`7Wk`G3UFQO)fa!&0?PA z1#ioh0T;U4Y>v6m!*+MUg`V~&EHC;vUkbR`%e^q>VsFpK1sD5tsA84r>!k?H?CWi{ zIkTUSOJQbz-!4{{{QL$4UK%hsaPy^sLnjwr8stCEDr<1S%D}7-f;Ml?`Y?ERVb+lG zC#)_H4Z9S0`J-utn=gMn^Ks$jPi9w9X8T7dMrIF-w2I9hzR0C0d&JT%$}0gY28_HC zxF#_6%E+k6MOQ{e&r@Cvidi}G>gc%5u~)~$?=HGJHsOTwT5#f}k=MrUD~!Dsa`17{ zwa}z0);Z&o6{B({q*-mrnRv#fIA>Bu7whX`nFB^$pPU`I<@%JI$;H>F=FYReF|A(m2tUq z3|vcc=NfgjxfNkNFzD7i(~)tv=9^6^xwXJzzD-`F<*J~(FKlAt@)p|eDarfNKE)<~ zk#km1{$lr{xcnuaPfGH?>L71cu+++l8zB=Qk@_6R>J@(KkV{TZ`5P@3~#HZhT6!;;68!(Z%0RE81GTe&&&(vnR}yEeZavtrsb_vB2ub9Zm<{1*526|5S2Z+}Vb_In4)_T0JmQ+Y~@`v)ts z#@;{ltZ4iF!>^v)xqn2aXIq}6TQ|7;sJ?Z4`7r~xvhw3b-E1Eu8xIP8aKdy{{DTy; zsbvpREf&~5OtV}a{P3jBmiULKZ1t@DD?H?>`}X3KgpST z|Mk<{1+CsZD_9-+=6SxZd3b@YN;O2DuWKmX8Oj~=@U*HbRbP35t~|BsuL>!Nuc=aX zW2KgQomefcqf)gctFF$(D$G=+aw1cxGqD=nP^D@{X0Fb}%Cm_|Wlq*ior#r&wMwNR zb5dtwrE9BFRVQn!&cv$3UZv6^>!!}c>Wqu(2~l5lBoqm)Rb?d7o-z*?iWTiuc_e}A 
zL@1_tsj^8z)rn9H?XF5EnXXRW9s%?44!u>$Y@Vy${Ewp`eMZavsskj8)rq*@^#`jG zh*qm3b+NS0bT<5a9Lk2OVktJLQ=!Nhrivoju1Kb6l=z+B1w*@6QP(n zUNwv4q&g7_|H-N_l8fp@D7=bfFQv#w9SKE6;sYres7{0;dw;Q% z_^T74IC%KFl#Ed)+)azxrmDVe2Cq=K(nI&?pu2ZGKXkO?4F?n}=SRk}Li&M!M5vLaO4oUPt073_#~>0~H94TGNtN@3pb3*R?Gq@+xp{I-w> zKbJ>V7%r$#<*E0w{NQ0j+wBUucdR^MgM4L(Ie9-ZSLRQw~cv!_=x;=7CgKq~DzWd{ZWem?^aDxfsdh^-SdVcp>ng*Ib(P`Hy2|ii-Jfvv@X)`ZQ`g==R@VWJaDp>j;0iam8_4vgv80k5 zSyvfOtg8%X)>VcJ>ng*Qb(P`9y2@~8U1fN%?oa3?=qerUr^)^wAIjO;3A+C)){f4( zT0H9i)#v|bpV!*a&0FVvKQ1S}PL2QQK2SQj+f=FaPIl&iX#6GifzsK71Lwj4(fI4^ z16PiR#$RV2xN$r*{yO`>o#UbL*VzXi9FM;NrH6AulfT0Q{dXUuf`i~29O~iW@z2_R z`syF%Szs#v`Nw*GzSlJWG|fLv^H0jnt$*5UYh0~-;|o>pLw|0?h`Aq-6vLJyHBjdcAr>@ z?LM)3+w!Yy^Y8EtEi;9|FWo<#NnvkWe+Ld!Hz^HKmhRM2W`;`5zu&+1|HBrbjl$MP zQB6JNS6)SGFiNdK=|VxN7pM<@wn#gstw`4vE-9R?(y`H%;(uIJ7&^&&(M<%QjR=l$nQ5*V@vD z&Ea<1>#I!H>chJ5huSgP$uh$|I{)gg|4o)>J}J_6xo>0;Uzai|pB?zNeX27qG~k!s zY0hzNlOi4E##}As)tg#ouD|^X|8VayCYkfDfBJSrN~LP88Dh~4vDD|O)Bt8^h(<7n z1sbCXn!*xRP{JBEXolu!0b8_$9a_O24se7MoZ$jjxWOGBXpJ`TL|e2&dvriYbV6r% zp$odA8@$mSJT^7>q?Q#vufu7>@~;WK6+SOv7}{z^9mr&oB$&n2pad2XhgDd6VJW`GGAzdmti&p;#u|KswOEHJe2ew?4$;_vjo5@3Y(^}$AP!rx4cifq z9oUIoNWk~_0Y4%UyRip*u@C!k06*a%4&gA4APGlt49Ag-6G%ZS(r^-|a2jWD7U?*L z44lUWTtp@=Aq$t0jVri{YskTM+`vub;ui9dj{+2;2*oJDZIt2;%5WF=a3AG(fQNX5 z3OvRWJjF9S#|yl~E4;=Vs37B7cr9o{2fC;NIrN~9s;GwQFhC8|gdvPj3$;-P#!$cn zbzzEnsE-CPLqjxzIV{i^P0$pUu!0iSut764M+?}ZCG5}&_HckBoZt)>xWWzY@IY&{ zfhXFc9onM_P&*#}D`siP()j*o%GGj|2D#2XP38aRf;?ieosAWSl?> zQjvy}IEB+VgR@A-Ib`5GF5n_EaS2(tjBH%NRa`?3uHy!7A{V!ihkO*E5Jf14blajQ z&6;uZl}THq7o^gwQ|U#i9;8>ERZ$JqVSpN_2}2m67HXpojG=%D>cSNDP#+CohK6Va zb6B7;nxH8xVFe|uVS{F9jux;*OW2_m?BM`MIKde%aD^M(;epm@15dO?JG4g!bVMg~ zh8Mb^E4slO-O&R*;e%f2jXv;2U-UzN_+bDBVh{%71AK@f7>bYZF+PDmhG95HAOL|F ziBSl`XpF&F1Y;aR5Q_1bfQgudFigf2OvN-z#|(UmnfMH|5RTdS9CI)i5txVhSb#`; zfra=Ii?A3=@D-NgYb?WZtiVdF!fLF+H&}~xh{Cs6kM9tT4cLfHh{0yWVhiH172B{K z@z{Z#*o6dqk00vuu6dsc_Q7hb0l(PD ziK(K*PQVK`DKgyBe`5QZa#LKuz|3Sl@>D1`o~Pze1|p%D6`LLu}=g+gqPij~+b 
z6)SOCQmn*rNwE^gC55W*FXNH}eHm1@lqoAtXus=Byc_J|#fiznf+{BKhbc2RX)Ch| zXikXl=LLj zNLs^;kkZvbSNl(gg(@>7B(15;kdUdk! z8UMe(4>XtD|8*|8myU(zlKa2bKG0lp|JS+XUOEQ@}RDK|+UrrbGSN4Y3f`#q1Nq;Jzw z%wZhmS66LBFVX((I7+ixTuQd>kH=9CnfFy08LM=fzvuCjNb_c1CYHYHi5BIu9qgOm z44O#$CU3iz$yyy{(l#CCv#mO2uD^Zkf7H#X=i?$Xl}{QG6gX~}LeAUxSyBG{)_+!} zKJj&oYSZ0O7_e0u36gScQ)#xv%7jo~3QdyM8q~9`=lQnJ$2#B{{Zg)0C2T2@c;k- literal 0 HcmV?d00001