From f76df37cfa03fe3df2f7f08756a52be5afa02e88 Mon Sep 17 00:00:00 2001 From: divanik Date: Mon, 18 Nov 2024 17:17:26 +0000 Subject: [PATCH] Add getAllFilesMask --- src/CMakeLists.txt | 1 + .../ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp | 4 +++- .../ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h | 8 +++++++- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 3627d760d4c..2ddc1fb3e69 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -135,6 +135,7 @@ add_headers_and_sources(dbms Storages/ObjectStorage/S3) add_headers_and_sources(dbms Storages/ObjectStorage/HDFS) add_headers_and_sources(dbms Storages/ObjectStorage/Local) add_headers_and_sources(dbms Storages/ObjectStorage/DataLakes) +add_headers_and_sources(dbms Storages/ObjectStorage/DataLakes/Iceberg) add_headers_and_sources(dbms Common/NamedCollections) add_headers_and_sources(dbms Common/Scheduler/Workload) diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp index 5af933eec2a..6148e8103ac 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp @@ -1,5 +1,6 @@ #include "config.h" + #if USE_AVRO # include @@ -88,6 +89,7 @@ enum class DataFileContent : uint8_t EQUALITY_DELETES = 2, }; + /** * Iceberg supports the next data types (see https://iceberg.apache.org/spec/#schemas-and-data-types): * - Primitive types: @@ -307,7 +309,7 @@ parseTableSchema(const Poco::JSON::Object::Ptr & metadata_object, int format_ver /// Field "schemas" is optional for version 1, but after version 2 was introduced, /// in most cases this field is added for new tables in version 1 as well. 
if (!ignore_schema_evolution && metadata_object->has("schemas") - && metadata_object->get("schemas").extract()->size() > 1) throw Exception( ErrorCodes::UNSUPPORTED_METHOD, "Cannot read Iceberg table: the table schema has been changed at least 1 time, reading tables with evolved schema is not " diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h index b02a60603ab..94a5acf28b5 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h @@ -88,7 +88,7 @@ struct SpecificSchemaPartitionInfo class PartitionPruningProcessor { public: - CommonPartitionInfo getCommonPartitionInfo(Poco::JSON::Array::Ptr partition_specification, const ColumnTuple * big_partition_tuple); + CommonPartitionInfo addCommonPartitionInfo(Poco::JSON::Array::Ptr partition_specification, const ColumnTuple * big_partition_tuple); SpecificSchemaPartitionInfo getSpecificPartitionPruning( const CommonPartitionInfo & common_info, @@ -97,6 +97,8 @@ public: std::vector getPruningMask(const SpecificSchemaPartitionInfo & specific_info, const ActionsDAG * filter_dag, ContextPtr context); + std::vector getAllFilesMask(const ActionsDAG * filter_dag, ContextPtr context); + private: static PartitionTransform getTransform(const String & transform_name) { @@ -189,6 +191,8 @@ private: std::unordered_map common_partition_info_by_manifest_file; std::map, SpecificSchemaPartitionInfo> specific_partition_info_by_manifest_file_and_schema; + + std::vector common_partition_infos; }; @@ -259,6 +263,8 @@ private: mutable Strings data_files; mutable Strings manifest_files; + + PartitionPruningProcessor pruning_processor; }; }