From bbeb3f61e5879efabac5879e284f5b11963dffc1 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Wed, 20 Nov 2024 13:04:04 +0100 Subject: [PATCH 01/41] init --- src/Interpreters/QueryAliasesVisitor.cpp | 20 ++++++++++++++++++-- src/Interpreters/QueryAliasesVisitor.h | 1 + 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/src/Interpreters/QueryAliasesVisitor.cpp b/src/Interpreters/QueryAliasesVisitor.cpp index 8f96044abec..9c3492587ec 100644 --- a/src/Interpreters/QueryAliasesVisitor.cpp +++ b/src/Interpreters/QueryAliasesVisitor.cpp @@ -114,17 +114,33 @@ void QueryAliasesMatcher::visit(const ASTSubquery & const_subquery, const AST subquery.prefer_alias_to_column_name = true; } +template +bool QueryAliasesMatcher::checkIfNamesAreSame(Data & data, const ASTPtr & ast) +{ + String name = ast->getColumnName(); + for (const auto & obj : data) + { + if (obj.second->getColumnName() == name) + return true; + } + return false; +} + template void QueryAliasesMatcher::visitOther(const ASTPtr & ast, Data & data) { auto & aliases = data; String alias = ast->tryGetAlias(); + if (!alias.empty()) { if (aliases.contains(alias) && ast->getTreeHash(/*ignore_aliases=*/ true) != aliases[alias]->getTreeHash(/*ignore_aliases=*/ true)) - throw Exception(wrongAliasMessage(ast, aliases[alias], alias), ErrorCodes::MULTIPLE_EXPRESSIONS_FOR_ALIAS); + { + if (checkIfNamesAreSame(aliases, ast)) + throw Exception(wrongAliasMessage(ast, aliases[alias], alias), ErrorCodes::MULTIPLE_EXPRESSIONS_FOR_ALIAS); + } - aliases[alias] = ast; + aliases[ast->getColumnNameWithoutAlias()] = ast; } /** QueryAliasesVisitor is executed before ExecuteScalarSubqueriesVisitor. diff --git a/src/Interpreters/QueryAliasesVisitor.h b/src/Interpreters/QueryAliasesVisitor.h index 6e79cfc77be..3cee3b2fc64 100644 --- a/src/Interpreters/QueryAliasesVisitor.h +++ b/src/Interpreters/QueryAliasesVisitor.h @@ -38,6 +38,7 @@ private: static void visit(const ASTSubquery & subquery, const ASTPtr & ast, Data & data); static void visit(const ASTArrayJoin &, const ASTPtr & ast, Data & data); static void visitOther(const ASTPtr & ast, Data & data); + static bool checkIfNamesAreSame(Data & data, const ASTPtr & ast); }; /// Visits AST nodes and collect their aliases in one map (with links to source nodes). From e0fb20f6d41c8fa0f3ecacbd77121b58cd38ae56 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Wed, 20 Nov 2024 13:45:23 +0100 Subject: [PATCH 02/41] Fix. --- src/Interpreters/QueryAliasesVisitor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Interpreters/QueryAliasesVisitor.cpp b/src/Interpreters/QueryAliasesVisitor.cpp index 9c3492587ec..23c81db9af0 100644 --- a/src/Interpreters/QueryAliasesVisitor.cpp +++ b/src/Interpreters/QueryAliasesVisitor.cpp @@ -140,7 +140,7 @@ void QueryAliasesMatcher::visitOther(const ASTPtr & ast, Data & data) throw Exception(wrongAliasMessage(ast, aliases[alias], alias), ErrorCodes::MULTIPLE_EXPRESSIONS_FOR_ALIAS); } - aliases[ast->getColumnNameWithoutAlias()] = ast; + aliases[alias] = ast; } /** QueryAliasesVisitor is executed before ExecuteScalarSubqueriesVisitor. From 7a30ea2e48ac49620968cb3cf8dc5d39e539ace2 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Wed, 20 Nov 2024 19:48:10 +0100 Subject: [PATCH 03/41] Normalize the query before parsing aliases. 
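Without normalization, an alias defined inside the UDF body survives
substitution, so two call sites of the same function emit two conflicting
definitions of the same alias. A minimal reproduction, using the function
and table from the test added in the next commit (issue #69143):

    CREATE FUNCTION test_function AS ( input_column_name ) -> ((
        '1' AS a,
        input_column_name AS input_column_name
    ).2);

    -- Each MATERIALIZED column expands the UDF body at its call site, so
    -- the inner alias gets bound to a different expression per expansion
    -- and the second ALTER failed with MULTIPLE_EXPRESSIONS_FOR_ALIAS.
    ALTER TABLE test_table ADD COLUMN mat_a String MATERIALIZED test_function(metadata_a);
    ALTER TABLE test_table ADD COLUMN mat_b String MATERIALIZED test_function(metadata_b);

Running QueryAliasesVisitor, MarkTableIdentifiersVisitor and QueryNormalizer
over the function body before substitution rewrites such aliases away, e.g.
test_03274(4 + 2) for ( x ) -> ((x + 1 as y, y + 2)) now expands to
((4 + 2) + 1, ((4 + 2) + 1) + 2) instead of ((4 + 2) + 1 AS y, y + 2).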
--- .../Resolve/QueryExpressionsAliasVisitor.h | 2 +- .../UserDefinedSQLFunctionVisitor.cpp | 29 ++++++++++++++----- .../UserDefinedSQLFunctionVisitor.h | 8 ++--- src/Interpreters/InterpreterAlterQuery.cpp | 2 +- src/Interpreters/InterpreterCreateQuery.cpp | 2 +- src/Interpreters/QueryAliasesVisitor.h | 1 - src/Interpreters/TreeRewriter.cpp | 2 +- 7 files changed, 29 insertions(+), 17 deletions(-) diff --git a/src/Analyzer/Resolve/QueryExpressionsAliasVisitor.h b/src/Analyzer/Resolve/QueryExpressionsAliasVisitor.h index 9824f4a2570..b4c47b306ae 100644 --- a/src/Analyzer/Resolve/QueryExpressionsAliasVisitor.h +++ b/src/Analyzer/Resolve/QueryExpressionsAliasVisitor.h @@ -96,7 +96,7 @@ private: auto [_, inserted] = aliases.alias_name_to_lambda_node.insert(std::make_pair(alias, node)); if (!inserted) - addDuplicatingAlias(node); + addDuplicatingAlias(node); return; } diff --git a/src/Functions/UserDefined/UserDefinedSQLFunctionVisitor.cpp b/src/Functions/UserDefined/UserDefinedSQLFunctionVisitor.cpp index a04b8d7b998..2ec62b9ce8f 100644 --- a/src/Functions/UserDefined/UserDefinedSQLFunctionVisitor.cpp +++ b/src/Functions/UserDefined/UserDefinedSQLFunctionVisitor.cpp @@ -11,7 +11,9 @@ #include #include #include -#include "Parsers/ASTColumnDeclaration.h" +#include +#include +#include namespace DB @@ -22,14 +24,14 @@ namespace ErrorCodes extern const int UNSUPPORTED_METHOD; } -void UserDefinedSQLFunctionVisitor::visit(ASTPtr & ast) +void UserDefinedSQLFunctionVisitor::visit(ASTPtr & ast, ContextPtr context_) { chassert(ast); if (const auto * function = ast->template as()) { std::unordered_set udf_in_replace_process; - auto replace_result = tryToReplaceFunction(*function, udf_in_replace_process); + auto replace_result = tryToReplaceFunction(*function, udf_in_replace_process, context_); if (replace_result) ast = replace_result; } @@ -40,7 +42,7 @@ void UserDefinedSQLFunctionVisitor::visit(ASTPtr & ast) return; auto * old_ptr = child.get(); - visit(child); + visit(child, context_); auto * new_ptr = child.get(); /// Some AST classes have naked pointers to children elements as members. @@ -50,16 +52,16 @@ void UserDefinedSQLFunctionVisitor::visit(ASTPtr & ast) } } -void UserDefinedSQLFunctionVisitor::visit(IAST * ast) +void UserDefinedSQLFunctionVisitor::visit(IAST * ast, ContextPtr context_) { if (!ast) return; for (auto & child : ast->children) - visit(child); + visit(child, context_); } -ASTPtr UserDefinedSQLFunctionVisitor::tryToReplaceFunction(const ASTFunction & function, std::unordered_set & udf_in_replace_process) +ASTPtr UserDefinedSQLFunctionVisitor::tryToReplaceFunction(const ASTFunction & function, std::unordered_set & udf_in_replace_process, ContextPtr context_) { if (udf_in_replace_process.find(function.name) != udf_in_replace_process.end()) throw Exception(ErrorCodes::UNSUPPORTED_METHOD, @@ -101,6 +103,17 @@ ASTPtr UserDefinedSQLFunctionVisitor::tryToReplaceFunction(const ASTFunction & f auto function_body_to_update = function_core_expression->children.at(1)->clone(); + Aliases aliases; + QueryAliasesVisitor(aliases).visit(function_body_to_update); + + /// Mark table ASTIdentifiers with not a column marker + MarkTableIdentifiersVisitor::Data identifiers_data{aliases}; + MarkTableIdentifiersVisitor(identifiers_data).visit(function_body_to_update); + + /// Common subexpression elimination. Rewrite rules. 
+ QueryNormalizer::Data normalizer_data(aliases, {}, true, context_->getSettingsRef(), true, false); + QueryNormalizer(normalizer_data).visit(function_body_to_update); + auto expression_list = std::make_shared(); expression_list->children.emplace_back(std::move(function_body_to_update)); @@ -116,7 +129,7 @@ ASTPtr UserDefinedSQLFunctionVisitor::tryToReplaceFunction(const ASTFunction & f { if (auto * inner_function = child->as()) { - auto replace_result = tryToReplaceFunction(*inner_function, udf_in_replace_process); + auto replace_result = tryToReplaceFunction(*inner_function, udf_in_replace_process, context_); if (replace_result) child = replace_result; } diff --git a/src/Functions/UserDefined/UserDefinedSQLFunctionVisitor.h b/src/Functions/UserDefined/UserDefinedSQLFunctionVisitor.h index c8cbf396707..36bdfc1499d 100644 --- a/src/Functions/UserDefined/UserDefinedSQLFunctionVisitor.h +++ b/src/Functions/UserDefined/UserDefinedSQLFunctionVisitor.h @@ -19,13 +19,13 @@ class ASTFunction; * After applying visitor: * SELECT number + 1 FROM system.numbers LIMIT 10; */ -class UserDefinedSQLFunctionVisitor +class UserDefinedSQLFunctionVisitor : WithContext { public: - static void visit(ASTPtr & ast); + static void visit(ASTPtr & ast, ContextPtr context_); private: - static void visit(IAST *); - static ASTPtr tryToReplaceFunction(const ASTFunction & function, std::unordered_set & udf_in_replace_process); + static void visit(IAST *, ContextPtr context_); + static ASTPtr tryToReplaceFunction(const ASTFunction & function, std::unordered_set & udf_in_replace_process, ContextPtr context_); }; diff --git a/src/Interpreters/InterpreterAlterQuery.cpp b/src/Interpreters/InterpreterAlterQuery.cpp index 907026c73a3..1bf62387307 100644 --- a/src/Interpreters/InterpreterAlterQuery.cpp +++ b/src/Interpreters/InterpreterAlterQuery.cpp @@ -99,7 +99,7 @@ BlockIO InterpreterAlterQuery::executeToTable(const ASTAlterQuery & alter) BlockIO res; if (!UserDefinedSQLFunctionFactory::instance().empty()) - UserDefinedSQLFunctionVisitor::visit(query_ptr); + UserDefinedSQLFunctionVisitor::visit(query_ptr, getContext()); auto table_id = getContext()->tryResolveStorageID(alter); StoragePtr table; diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index ff0e1d7f5a8..bf2c33f6020 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -1599,7 +1599,7 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create) // substitute possible UDFs with their definitions if (!UserDefinedSQLFunctionFactory::instance().empty()) - UserDefinedSQLFunctionVisitor::visit(query_ptr); + UserDefinedSQLFunctionVisitor::visit(query_ptr, getContext()); /// Set and retrieve list of columns, indices and constraints. Set table engine if needed. Rewrite query in canonical way. 
TableProperties properties = getTablePropertiesAndNormalizeCreateQuery(create, mode); diff --git a/src/Interpreters/QueryAliasesVisitor.h b/src/Interpreters/QueryAliasesVisitor.h index 3cee3b2fc64..6e79cfc77be 100644 --- a/src/Interpreters/QueryAliasesVisitor.h +++ b/src/Interpreters/QueryAliasesVisitor.h @@ -38,7 +38,6 @@ private: static void visit(const ASTSubquery & subquery, const ASTPtr & ast, Data & data); static void visit(const ASTArrayJoin &, const ASTPtr & ast, Data & data); static void visitOther(const ASTPtr & ast, Data & data); - static bool checkIfNamesAreSame(Data & data, const ASTPtr & ast); }; /// Visits AST nodes and collect their aliases in one map (with links to source nodes). diff --git a/src/Interpreters/TreeRewriter.cpp b/src/Interpreters/TreeRewriter.cpp index 16b0e7ef199..7f2b766e712 100644 --- a/src/Interpreters/TreeRewriter.cpp +++ b/src/Interpreters/TreeRewriter.cpp @@ -1571,7 +1571,7 @@ void TreeRewriter::normalize( ASTPtr & query, Aliases & aliases, const NameSet & source_columns_set, bool ignore_alias, const Settings & settings, bool allow_self_aliases, ContextPtr context_, bool is_create_parameterized_view) { if (!UserDefinedSQLFunctionFactory::instance().empty()) - UserDefinedSQLFunctionVisitor::visit(query); + UserDefinedSQLFunctionVisitor::visit(query, context_); CustomizeCountDistinctVisitor::Data data_count_distinct{settings[Setting::count_distinct_implementation]}; CustomizeCountDistinctVisitor(data_count_distinct).visit(query); From ec9366e4e44f3e8eadca3237137b3972c77ad330 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Wed, 20 Nov 2024 19:55:06 +0100 Subject: [PATCH 04/41] Add tests. --- .../03274_aliases_in_udf.reference | 5 +++ .../0_stateless/03274_aliases_in_udf.sql | 32 +++++++++++++++++++ 2 files changed, 37 insertions(+) create mode 100644 tests/queries/0_stateless/03274_aliases_in_udf.reference create mode 100644 tests/queries/0_stateless/03274_aliases_in_udf.sql diff --git a/tests/queries/0_stateless/03274_aliases_in_udf.reference b/tests/queries/0_stateless/03274_aliases_in_udf.reference new file mode 100644 index 00000000000..eb17ca9feba --- /dev/null +++ b/tests/queries/0_stateless/03274_aliases_in_udf.reference @@ -0,0 +1,5 @@ +FIX ISSUE #69143 +a +b +EXPAIN SYNTAX OF UDF +SELECT ((4 + 2) + 1, ((4 + 2) + 1) + 2) \ No newline at end of file diff --git a/tests/queries/0_stateless/03274_aliases_in_udf.sql b/tests/queries/0_stateless/03274_aliases_in_udf.sql new file mode 100644 index 00000000000..cf53ec4b6cb --- /dev/null +++ b/tests/queries/0_stateless/03274_aliases_in_udf.sql @@ -0,0 +1,32 @@ +SELECT "FIX ISSUE #69143" + +CREATE OR REPLACE FUNCTION test_function AS ( input_column_name ) -> (( + '1' AS a, + input_column_name AS input_column_name + ).2); + +CREATE TABLE test_table +( + `metadata_a` String, + `metadata_b` String +) +ENGINE = MergeTree() +ORDER BY tuple(); + + +ALTER TABLE test_table ADD COLUMN mat_a String MATERIALIZED test_function(metadata_a); +ALTER TABLE test_table MATERIALIZE COLUMN `mat_a`; + +ALTER TABLE test_table ADD COLUMN mat_b String MATERIALIZED test_function(metadata_b); +ALTER TABLE test_table MATERIALIZE COLUMN `mat_b`; + +INSERT INTO test_table SELECT 'a', 'b'; + +SELECT mat_a FROM test_table; +SELECT mat_b FROM test_table; + +SELECT "EXPAIN SYNTAX OF UDF"; + +CREATE OR REPLACE FUNCTION test_03274 AS ( x ) -> ((x + 1 as y, y + 2)); + +EXPAIN SYNTAX SELECT test_03274(4 + 2); From a7c3acd90cd119fc37954cad8b19c7bf40d9c96f Mon Sep 17 00:00:00 2001 From: 
Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Wed, 20 Nov 2024 19:57:15 +0100 Subject: [PATCH 05/41] Minor clarification. --- src/Interpreters/QueryNormalizer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Interpreters/QueryNormalizer.cpp b/src/Interpreters/QueryNormalizer.cpp index bba30fb5194..8f85ee22682 100644 --- a/src/Interpreters/QueryNormalizer.cpp +++ b/src/Interpreters/QueryNormalizer.cpp @@ -85,10 +85,10 @@ void QueryNormalizer::visit(ASTIdentifier & node, ASTPtr & ast, Data & data) } /// If it is an alias, but not a parent alias (for constructs like "SELECT column + 1 AS column"). - auto it_alias = data.aliases.find(node.name()); if (!data.allow_self_aliases && current_alias == node.name()) throw Exception(ErrorCodes::CYCLIC_ALIASES, "Self referencing of {} to {}. Cyclic alias", backQuote(current_alias), backQuote(node.name())); + auto it_alias = data.aliases.find(node.name()); if (it_alias != data.aliases.end() && current_alias != node.name()) { From e81c5171b78a659726ac6380a6c5f5e70acfbe1d Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Wed, 20 Nov 2024 20:10:04 +0100 Subject: [PATCH 06/41] Small fixup. --- src/Interpreters/QueryAliasesVisitor.cpp | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/src/Interpreters/QueryAliasesVisitor.cpp b/src/Interpreters/QueryAliasesVisitor.cpp index 23c81db9af0..85335c609e5 100644 --- a/src/Interpreters/QueryAliasesVisitor.cpp +++ b/src/Interpreters/QueryAliasesVisitor.cpp @@ -114,31 +114,15 @@ void QueryAliasesMatcher::visit(const ASTSubquery & const_subquery, const AST subquery.prefer_alias_to_column_name = true; } -template -bool QueryAliasesMatcher::checkIfNamesAreSame(Data & data, const ASTPtr & ast) -{ - String name = ast->getColumnName(); - for (const auto & obj : data) - { - if (obj.second->getColumnName() == name) - return true; - } - return false; -} - template void QueryAliasesMatcher::visitOther(const ASTPtr & ast, Data & data) { auto & aliases = data; String alias = ast->tryGetAlias(); - if (!alias.empty()) { if (aliases.contains(alias) && ast->getTreeHash(/*ignore_aliases=*/ true) != aliases[alias]->getTreeHash(/*ignore_aliases=*/ true)) - { - if (checkIfNamesAreSame(aliases, ast)) throw Exception(wrongAliasMessage(ast, aliases[alias], alias), ErrorCodes::MULTIPLE_EXPRESSIONS_FOR_ALIAS); - } aliases[alias] = ast; } From fedbdaa29c56c23e6ecb8290a7dc04da8e1f3420 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Wed, 20 Nov 2024 20:22:36 +0100 Subject: [PATCH 07/41] Fix build --- src/Functions/UserDefined/UserDefinedSQLFunctionVisitor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Functions/UserDefined/UserDefinedSQLFunctionVisitor.h b/src/Functions/UserDefined/UserDefinedSQLFunctionVisitor.h index 36bdfc1499d..633f68db348 100644 --- a/src/Functions/UserDefined/UserDefinedSQLFunctionVisitor.h +++ b/src/Functions/UserDefined/UserDefinedSQLFunctionVisitor.h @@ -19,7 +19,7 @@ class ASTFunction; * After applying visitor: * SELECT number + 1 FROM system.numbers LIMIT 10; */ -class UserDefinedSQLFunctionVisitor : WithContext +class UserDefinedSQLFunctionVisitor { public: static void visit(ASTPtr & ast, ContextPtr context_); From c738edf8bbb269b72c6ca308b78bc20ee7dc48b5 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Wed, 20 Nov 2024 20:45:58 +0100 Subject: [PATCH 08/41] Add a 
semicolon. --- tests/queries/0_stateless/03274_aliases_in_udf.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/03274_aliases_in_udf.sql b/tests/queries/0_stateless/03274_aliases_in_udf.sql index cf53ec4b6cb..92fa8492177 100644 --- a/tests/queries/0_stateless/03274_aliases_in_udf.sql +++ b/tests/queries/0_stateless/03274_aliases_in_udf.sql @@ -1,4 +1,4 @@ -SELECT "FIX ISSUE #69143" +SELECT "FIX ISSUE #69143"; CREATE OR REPLACE FUNCTION test_function AS ( input_column_name ) -> (( '1' AS a, From ed285f47dc2b47fada0343ed69b7c3d91f7a2915 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Wed, 20 Nov 2024 23:14:19 +0100 Subject: [PATCH 09/41] Update 03274_aliases_in_udf.sql --- tests/queries/0_stateless/03274_aliases_in_udf.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/03274_aliases_in_udf.sql b/tests/queries/0_stateless/03274_aliases_in_udf.sql index 92fa8492177..c909990d7ce 100644 --- a/tests/queries/0_stateless/03274_aliases_in_udf.sql +++ b/tests/queries/0_stateless/03274_aliases_in_udf.sql @@ -1,4 +1,4 @@ -SELECT "FIX ISSUE #69143"; +SELECT 'FIX ISSUE #69143'; CREATE OR REPLACE FUNCTION test_function AS ( input_column_name ) -> (( '1' AS a, @@ -25,7 +25,7 @@ INSERT INTO test_table SELECT 'a', 'b'; SELECT mat_a FROM test_table; SELECT mat_b FROM test_table; -SELECT "EXPAIN SYNTAX OF UDF"; +SELECT 'EXPAIN SYNTAX OF UDF'; CREATE OR REPLACE FUNCTION test_03274 AS ( x ) -> ((x + 1 as y, y + 2)); From 9cfee9cddbf90ca2cc2ed15803a150abe2e7ca87 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Thu, 21 Nov 2024 00:55:26 +0100 Subject: [PATCH 10/41] Update 03274_aliases_in_udf.reference --- tests/queries/0_stateless/03274_aliases_in_udf.reference | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/03274_aliases_in_udf.reference b/tests/queries/0_stateless/03274_aliases_in_udf.reference index eb17ca9feba..2d51e7becae 100644 --- a/tests/queries/0_stateless/03274_aliases_in_udf.reference +++ b/tests/queries/0_stateless/03274_aliases_in_udf.reference @@ -2,4 +2,4 @@ FIX ISSUE #69143 a b EXPAIN SYNTAX OF UDF -SELECT ((4 + 2) + 1, ((4 + 2) + 1) + 2) \ No newline at end of file +SELECT ((4 + 2) + 1, ((4 + 2) + 1) + 2) From 7dcaacab9daa7dd491dde5fcaf698d2d85a21d85 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Thu, 21 Nov 2024 01:00:37 +0100 Subject: [PATCH 11/41] Update 03274_aliases_in_udf.sql --- tests/queries/0_stateless/03274_aliases_in_udf.sql | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/03274_aliases_in_udf.sql b/tests/queries/0_stateless/03274_aliases_in_udf.sql index c909990d7ce..a6154b81907 100644 --- a/tests/queries/0_stateless/03274_aliases_in_udf.sql +++ b/tests/queries/0_stateless/03274_aliases_in_udf.sql @@ -1,11 +1,15 @@ SELECT 'FIX ISSUE #69143'; +DROP FUNCTION IF EXISTS test_function; +DROP FUNCTION IF EXISTS test_03274; +DROP TABLE IF EXISTS test_table; + CREATE OR REPLACE FUNCTION test_function AS ( input_column_name ) -> (( '1' AS a, input_column_name AS input_column_name ).2); -CREATE TABLE test_table +CREATE TABLE IF NOT EXISTS test_table ( `metadata_a` String, `metadata_b` String @@ -30,3 +34,7 @@ SELECT 'EXPAIN SYNTAX OF UDF'; CREATE OR REPLACE FUNCTION test_03274 AS ( x ) -> ((x + 1 as y, y + 2)); EXPAIN SYNTAX SELECT 
test_03274(4 + 2); + +DROP FUNCTION IF EXISTS test_function; +DROP FUNCTION IF EXISTS test_03274; +DROP TABLE IF EXISTS test_table; From 05b0ae9faa86d862b11404d7294b4559d905b53d Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Thu, 21 Nov 2024 12:37:07 +0100 Subject: [PATCH 12/41] Update 03274_aliases_in_udf.sql --- .../queries/0_stateless/03274_aliases_in_udf.sql | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/queries/0_stateless/03274_aliases_in_udf.sql b/tests/queries/0_stateless/03274_aliases_in_udf.sql index a6154b81907..612249d6eb1 100644 --- a/tests/queries/0_stateless/03274_aliases_in_udf.sql +++ b/tests/queries/0_stateless/03274_aliases_in_udf.sql @@ -1,10 +1,10 @@ +-- Tags: no-parallel + SELECT 'FIX ISSUE #69143'; -DROP FUNCTION IF EXISTS test_function; -DROP FUNCTION IF EXISTS test_03274; DROP TABLE IF EXISTS test_table; -CREATE OR REPLACE FUNCTION test_function AS ( input_column_name ) -> (( +CREATE FUNCTION IF NOT EXISTS 03274_test_function AS ( input_column_name ) -> (( '1' AS a, input_column_name AS input_column_name ).2); @@ -18,10 +18,10 @@ ENGINE = MergeTree() ORDER BY tuple(); -ALTER TABLE test_table ADD COLUMN mat_a String MATERIALIZED test_function(metadata_a); +ALTER TABLE test_table ADD COLUMN mat_a String MATERIALIZED 03274_test_function(metadata_a); ALTER TABLE test_table MATERIALIZE COLUMN `mat_a`; -ALTER TABLE test_table ADD COLUMN mat_b String MATERIALIZED test_function(metadata_b); +ALTER TABLE test_table ADD COLUMN mat_b String MATERIALIZED 03274_test_function(metadata_b); ALTER TABLE test_table MATERIALIZE COLUMN `mat_b`; INSERT INTO test_table SELECT 'a', 'b'; @@ -31,10 +31,10 @@ SELECT mat_b FROM test_table; SELECT 'EXPAIN SYNTAX OF UDF'; -CREATE OR REPLACE FUNCTION test_03274 AS ( x ) -> ((x + 1 as y, y + 2)); +CREATE FUNCTION IF NOT EXISTS test_03274 AS ( x ) -> ((x + 1 as y, y + 2)); EXPAIN SYNTAX SELECT test_03274(4 + 2); -DROP FUNCTION IF EXISTS test_function; -DROP FUNCTION IF EXISTS test_03274; +DROP FUNCTION 03274_test_function; +DROP FUNCTION test_03274; DROP TABLE IF EXISTS test_table; From b0b78cd4532ec271b9bb23a93a6d41ec20619b16 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Thu, 21 Nov 2024 12:59:46 +0100 Subject: [PATCH 13/41] Update 03274_aliases_in_udf.sql --- tests/queries/0_stateless/03274_aliases_in_udf.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/03274_aliases_in_udf.sql b/tests/queries/0_stateless/03274_aliases_in_udf.sql index 612249d6eb1..6c9c2153513 100644 --- a/tests/queries/0_stateless/03274_aliases_in_udf.sql +++ b/tests/queries/0_stateless/03274_aliases_in_udf.sql @@ -29,11 +29,11 @@ INSERT INTO test_table SELECT 'a', 'b'; SELECT mat_a FROM test_table; SELECT mat_b FROM test_table; -SELECT 'EXPAIN SYNTAX OF UDF'; +SELECT 'EXPLAIN SYNTAX OF UDF'; CREATE FUNCTION IF NOT EXISTS test_03274 AS ( x ) -> ((x + 1 as y, y + 2)); -EXPAIN SYNTAX SELECT test_03274(4 + 2); +EXPLAIN SYNTAX SELECT test_03274(4 + 2); DROP FUNCTION 03274_test_function; DROP FUNCTION test_03274; From e8036e132c2b429354ee823059b1f1dfd9487791 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Thu, 21 Nov 2024 13:08:42 +0100 Subject: [PATCH 14/41] Update 03274_aliases_in_udf.reference --- tests/queries/0_stateless/03274_aliases_in_udf.reference | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/tests/queries/0_stateless/03274_aliases_in_udf.reference b/tests/queries/0_stateless/03274_aliases_in_udf.reference index 2d51e7becae..19c13978405 100644 --- a/tests/queries/0_stateless/03274_aliases_in_udf.reference +++ b/tests/queries/0_stateless/03274_aliases_in_udf.reference @@ -1,5 +1,5 @@ FIX ISSUE #69143 a b -EXPAIN SYNTAX OF UDF +EXPLAIN SYNTAX OF UDF SELECT ((4 + 2) + 1, ((4 + 2) + 1) + 2) From 143be4b7af46befd5f3f0f9ddb17c29c1f3862c6 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Thu, 21 Nov 2024 19:46:55 +0100 Subject: [PATCH 15/41] Add a setting for this use-case. --- src/Core/Settings.cpp | 42 +++++++++++++++++++ src/Core/SettingsChangesHistory.cpp | 1 + .../UserDefinedSQLFunctionVisitor.cpp | 19 +++++---- .../03274_aliases_in_udf.reference | 1 + .../0_stateless/03274_aliases_in_udf.sql | 13 +++++- 5 files changed, 67 insertions(+), 9 deletions(-) diff --git a/src/Core/Settings.cpp b/src/Core/Settings.cpp index 140a77011dd..47171705afc 100644 --- a/src/Core/Settings.cpp +++ b/src/Core/Settings.cpp @@ -2251,6 +2251,48 @@ Result: ``` )", 0) \ \ + DECLARE(Bool, skip_redundant_aliases_in_udf, false, R"( +Redundant aliases are not used (substituted) in user-defined functions in order to simplify it's usage. + +Possible values: + +- 1 — The aliases are skipped (substituted) in UDFs. +- 0 — The aliases are not skipped (substituted) in UDFs. + +**Example** + +The difference between enabled and disabled: + +Query: + +```sql +SET skip_redundant_aliases_in_udf = 0; +CREATE FUNCTION IF NOT EXISTS test_03274 AS ( x ) -> ((x + 1 as y, y + 2)); + +EXPLAIN SYNTAX SELECT test_03274(4 + 2); +``` + +Result: + +```text +SELECT ((4 + 2) + 1 AS y, y + 2) +``` + +Query: + +```sql +SET skip_redundant_aliases_in_udf = 1; +CREATE FUNCTION IF NOT EXISTS test_03274 AS ( x ) -> ((x + 1 as y, y + 2)); + +EXPLAIN SYNTAX SELECT test_03274(4 + 2); +``` + +Result: + +```text +SELECT ((4 + 2) + 1, ((4 + 2) + 1) + 2) +``` +)", 0) \ DECLARE(Bool, prefer_global_in_and_join, false, R"( Enables the replacement of `IN`/`JOIN` operators with `GLOBAL IN`/`GLOBAL JOIN`. diff --git a/src/Core/SettingsChangesHistory.cpp b/src/Core/SettingsChangesHistory.cpp index 18a9dd6ecbf..3759e08b290 100644 --- a/src/Core/SettingsChangesHistory.cpp +++ b/src/Core/SettingsChangesHistory.cpp @@ -89,6 +89,7 @@ static std::initializer_listchildren.at(1)->clone(); - Aliases aliases; - QueryAliasesVisitor(aliases).visit(function_body_to_update); + if (context_.getSettingsRef()[Setting::skip_redundant_aliases_in_udf]) + { + Aliases aliases; + QueryAliasesVisitor(aliases).visit(function_body_to_update); - /// Mark table ASTIdentifiers with not a column marker - MarkTableIdentifiersVisitor::Data identifiers_data{aliases}; - MarkTableIdentifiersVisitor(identifiers_data).visit(function_body_to_update); + /// Mark table ASTIdentifiers with not a column marker + MarkTableIdentifiersVisitor::Data identifiers_data{aliases}; + MarkTableIdentifiersVisitor(identifiers_data).visit(function_body_to_update); - /// Common subexpression elimination. Rewrite rules. - QueryNormalizer::Data normalizer_data(aliases, {}, true, context_->getSettingsRef(), true, false); - QueryNormalizer(normalizer_data).visit(function_body_to_update); + /// Common subexpression elimination. Rewrite rules. 
+ QueryNormalizer::Data normalizer_data(aliases, {}, true, context_->getSettingsRef(), true, false); + QueryNormalizer(normalizer_data).visit(function_body_to_update); + } auto expression_list = std::make_shared(); expression_list->children.emplace_back(std::move(function_body_to_update)); diff --git a/tests/queries/0_stateless/03274_aliases_in_udf.reference b/tests/queries/0_stateless/03274_aliases_in_udf.reference index 19c13978405..07c8e7130b3 100644 --- a/tests/queries/0_stateless/03274_aliases_in_udf.reference +++ b/tests/queries/0_stateless/03274_aliases_in_udf.reference @@ -2,4 +2,5 @@ FIX ISSUE #69143 a b EXPLAIN SYNTAX OF UDF +SELECT ((4 + 2) + 1 AS y, y + 2) SELECT ((4 + 2) + 1, ((4 + 2) + 1) + 2) diff --git a/tests/queries/0_stateless/03274_aliases_in_udf.sql b/tests/queries/0_stateless/03274_aliases_in_udf.sql index 6c9c2153513..db1efcf6853 100644 --- a/tests/queries/0_stateless/03274_aliases_in_udf.sql +++ b/tests/queries/0_stateless/03274_aliases_in_udf.sql @@ -1,5 +1,7 @@ -- Tags: no-parallel +SET skip_redundant_aliases_in_udf = 0; + SELECT 'FIX ISSUE #69143'; DROP TABLE IF EXISTS test_table; @@ -17,10 +19,13 @@ CREATE TABLE IF NOT EXISTS test_table ENGINE = MergeTree() ORDER BY tuple(); - ALTER TABLE test_table ADD COLUMN mat_a String MATERIALIZED 03274_test_function(metadata_a); ALTER TABLE test_table MATERIALIZE COLUMN `mat_a`; +ALTER TABLE test_table ADD COLUMN mat_b String MATERIALIZED 03274_test_function(metadata_b); -- { serverError MULTIPLE_EXPRESSIONS_FOR_ALIAS } + +SET skip_redundant_aliases_in_udf = 1; + ALTER TABLE test_table ADD COLUMN mat_b String MATERIALIZED 03274_test_function(metadata_b); ALTER TABLE test_table MATERIALIZE COLUMN `mat_b`; @@ -33,6 +38,12 @@ SELECT 'EXPLAIN SYNTAX OF UDF'; CREATE FUNCTION IF NOT EXISTS test_03274 AS ( x ) -> ((x + 1 as y, y + 2)); +SET skip_redundant_aliases_in_udf = 0; + +EXPLAIN SYNTAX SELECT test_03274(4 + 2); + +SET skip_redundant_aliases_in_udf = 1; + EXPLAIN SYNTAX SELECT test_03274(4 + 2); DROP FUNCTION 03274_test_function; From 9a606ff996747c62bcd19eeb9e8b9fdeae15ee34 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Thu, 21 Nov 2024 20:37:08 +0100 Subject: [PATCH 16/41] fix build --- src/Functions/UserDefined/UserDefinedSQLFunctionVisitor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Functions/UserDefined/UserDefinedSQLFunctionVisitor.cpp b/src/Functions/UserDefined/UserDefinedSQLFunctionVisitor.cpp index 78532c9d73e..395b0070576 100644 --- a/src/Functions/UserDefined/UserDefinedSQLFunctionVisitor.cpp +++ b/src/Functions/UserDefined/UserDefinedSQLFunctionVisitor.cpp @@ -103,7 +103,7 @@ ASTPtr UserDefinedSQLFunctionVisitor::tryToReplaceFunction(const ASTFunction & f auto function_body_to_update = function_core_expression->children.at(1)->clone(); - if (context_.getSettingsRef()[Setting::skip_redundant_aliases_in_udf]) + if (context_->getSettingsRef()[Setting::skip_redundant_aliases_in_udf]) { Aliases aliases; QueryAliasesVisitor(aliases).visit(function_body_to_update); From 5caf748a357a9d2f7b684858689f25c312e1d7f5 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Fri, 22 Nov 2024 13:33:11 +0100 Subject: [PATCH 17/41] Add a setting to namespace. 
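ClickHouse resolves settings per translation unit: a source file that reads
a setting through context->getSettingsRef()[Setting::name] needs a matching
extern declaration in its local DB::Setting namespace (the DECLARE
definitions live in src/Core/Settings.cpp). The lookup added in the
previous commits was missing that declaration; a sketch of the pattern this
commit applies:

    namespace DB
    {
    namespace Setting
    {
        extern const SettingsBool skip_redundant_aliases_in_udf;
    }
    }

    /// consumed as:
    /// context_->getSettingsRef()[Setting::skip_redundant_aliases_in_udf]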
--- src/Functions/UserDefined/UserDefinedSQLFunctionVisitor.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/Functions/UserDefined/UserDefinedSQLFunctionVisitor.cpp b/src/Functions/UserDefined/UserDefinedSQLFunctionVisitor.cpp index 395b0070576..0ceb9764078 100644 --- a/src/Functions/UserDefined/UserDefinedSQLFunctionVisitor.cpp +++ b/src/Functions/UserDefined/UserDefinedSQLFunctionVisitor.cpp @@ -18,6 +18,10 @@ namespace DB { +namespace Setting +{ + extern const SettingsBool skip_redundant_aliases_in_udf; +} namespace ErrorCodes { From e2da09627be11b5cf6bdf0263e5f039c8256eead Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Mon, 25 Nov 2024 19:49:44 +0100 Subject: [PATCH 18/41] Fix build after merge. --- src/Functions/UserDefined/UserDefinedSQLFunctionVisitor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Functions/UserDefined/UserDefinedSQLFunctionVisitor.cpp b/src/Functions/UserDefined/UserDefinedSQLFunctionVisitor.cpp index 87061626090..75c62705aa5 100644 --- a/src/Functions/UserDefined/UserDefinedSQLFunctionVisitor.cpp +++ b/src/Functions/UserDefined/UserDefinedSQLFunctionVisitor.cpp @@ -58,7 +58,7 @@ void UserDefinedSQLFunctionVisitor::visit(ASTPtr & ast, ContextPtr context_) if (const auto * function = ast->template as()) { std::unordered_set udf_in_replace_process; - auto replace_result = tryToReplaceFunction(*function, udf_in_replace_process); + auto replace_result = tryToReplaceFunction(*function, udf_in_replace_process, context_); if (replace_result) ast = replace_result; } From 4e51f382cb35f8f0cb8df08996bfcf016d00cb92 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Wed, 4 Dec 2024 12:50:24 +0100 Subject: [PATCH 19/41] Try to add setting into the new version --- src/Core/SettingsChangesHistory.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Core/SettingsChangesHistory.cpp b/src/Core/SettingsChangesHistory.cpp index b47655b2d13..855cff25fbc 100644 --- a/src/Core/SettingsChangesHistory.cpp +++ b/src/Core/SettingsChangesHistory.cpp @@ -69,6 +69,7 @@ static std::initializer_list Date: Wed, 4 Dec 2024 20:05:46 +0100 Subject: [PATCH 20/41] fix clickhouse_driver inserts --- src/Server/TCPHandler.cpp | 6 ++++++ .../0_stateless/03279_with_clickhouse_driver.py | 15 +++++++++++++++ .../03279_with_clickhouse_driver.reference | 1 + .../0_stateless/03279_with_clickhouse_driver.sh | 8 ++++++++ 4 files changed, 30 insertions(+) create mode 100644 tests/queries/0_stateless/03279_with_clickhouse_driver.py create mode 100644 tests/queries/0_stateless/03279_with_clickhouse_driver.reference create mode 100755 tests/queries/0_stateless/03279_with_clickhouse_driver.sh diff --git a/src/Server/TCPHandler.cpp b/src/Server/TCPHandler.cpp index 6d96905462a..5dd31b8af98 100644 --- a/src/Server/TCPHandler.cpp +++ b/src/Server/TCPHandler.cpp @@ -37,6 +37,7 @@ #include #include #include +#include "Common/StackTrace.h" #include #include #include @@ -1110,8 +1111,13 @@ void TCPHandler::processInsertQuery(QueryState & state) startInsertQuery(state); while (receivePacketsExpectDataConcurrentWithExecutor(state)) + { executor.push(std::move(state.block_for_insert)); + sendLogs(state); + sendInsertProfileEvents(state); + } + state.read_all_data = true; executor.finish(); diff --git a/tests/queries/0_stateless/03279_with_clickhouse_driver.py b/tests/queries/0_stateless/03279_with_clickhouse_driver.py new file mode 100644 index 
00000000000..15c97436854 --- /dev/null +++ b/tests/queries/0_stateless/03279_with_clickhouse_driver.py @@ -0,0 +1,15 @@ +import sys +from clickhouse_driver import Client + + +def run(database): + client = Client("localhost",user="default",password="") + client.execute(f"CREATE TABLE IF NOT EXISTS {database}.test (x Int32) ENGINE = Memory") + client.execute(f"INSERT INTO {database}.test (x) VALUES", [{"x": 100}]) + result = client.execute(f"SELECT * FROM {database}.test") + print(result) + + +if __name__ == "__main__": + database = sys.argv[1] + run(database) diff --git a/tests/queries/0_stateless/03279_with_clickhouse_driver.reference b/tests/queries/0_stateless/03279_with_clickhouse_driver.reference new file mode 100644 index 00000000000..5ba0bbef3dc --- /dev/null +++ b/tests/queries/0_stateless/03279_with_clickhouse_driver.reference @@ -0,0 +1 @@ +[(100,)] diff --git a/tests/queries/0_stateless/03279_with_clickhouse_driver.sh b/tests/queries/0_stateless/03279_with_clickhouse_driver.sh new file mode 100755 index 00000000000..40493441d0d --- /dev/null +++ b/tests/queries/0_stateless/03279_with_clickhouse_driver.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash +# Tags: no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +python3 03279_with_clickhouse_driver.py $CLICKHOUSE_DATABASE \ No newline at end of file From c866e38454e304c94a3947becba8300e198dd9c8 Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Wed, 4 Dec 2024 22:47:45 +0100 Subject: [PATCH 21/41] fix test --- tests/queries/0_stateless/03279_with_clickhouse_driver.py | 7 +++++-- tests/queries/0_stateless/03279_with_clickhouse_driver.sh | 8 -------- 2 files changed, 5 insertions(+), 10 deletions(-) mode change 100644 => 100755 tests/queries/0_stateless/03279_with_clickhouse_driver.py delete mode 100755 tests/queries/0_stateless/03279_with_clickhouse_driver.sh diff --git a/tests/queries/0_stateless/03279_with_clickhouse_driver.py b/tests/queries/0_stateless/03279_with_clickhouse_driver.py old mode 100644 new mode 100755 index 15c97436854..43400e48f36 --- a/tests/queries/0_stateless/03279_with_clickhouse_driver.py +++ b/tests/queries/0_stateless/03279_with_clickhouse_driver.py @@ -1,4 +1,7 @@ -import sys +#!/usr/bin/env python3 +# Tags: no-fasttest + +import os from clickhouse_driver import Client @@ -11,5 +14,5 @@ def run(database): if __name__ == "__main__": - database = sys.argv[1] + database = os.environ["CLICKHOUSE_DATABASE"] run(database) diff --git a/tests/queries/0_stateless/03279_with_clickhouse_driver.sh b/tests/queries/0_stateless/03279_with_clickhouse_driver.sh deleted file mode 100755 index 40493441d0d..00000000000 --- a/tests/queries/0_stateless/03279_with_clickhouse_driver.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/env bash -# Tags: no-fasttest - -CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) -# shellcheck source=../shell_config.sh -. 
"$CURDIR"/../shell_config.sh - -python3 03279_with_clickhouse_driver.py $CLICKHOUSE_DATABASE \ No newline at end of file From 95438e0011c23cc8253f155044883d968593e72e Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Wed, 4 Dec 2024 22:53:05 +0100 Subject: [PATCH 22/41] add clickhouse-driver to the image --- docker/test/stateless/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/test/stateless/requirements.txt b/docker/test/stateless/requirements.txt index cae5b805e1b..c23537d98a9 100644 --- a/docker/test/stateless/requirements.txt +++ b/docker/test/stateless/requirements.txt @@ -50,3 +50,4 @@ urllib3==1.26.5 wadllib==1.3.6 wheel==0.37.1 zipp==1.0.0 +clickhouse-driver==0.2.7 From 516fd98327270ed928a0c85c36718c394b8b686c Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Wed, 4 Dec 2024 22:15:35 +0000 Subject: [PATCH 23/41] Automatic style fix --- .../queries/0_stateless/03279_with_clickhouse_driver.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/queries/0_stateless/03279_with_clickhouse_driver.py b/tests/queries/0_stateless/03279_with_clickhouse_driver.py index 43400e48f36..c63850af74d 100755 --- a/tests/queries/0_stateless/03279_with_clickhouse_driver.py +++ b/tests/queries/0_stateless/03279_with_clickhouse_driver.py @@ -2,17 +2,20 @@ # Tags: no-fasttest import os + from clickhouse_driver import Client def run(database): - client = Client("localhost",user="default",password="") - client.execute(f"CREATE TABLE IF NOT EXISTS {database}.test (x Int32) ENGINE = Memory") + client = Client("localhost", user="default", password="") + client.execute( + f"CREATE TABLE IF NOT EXISTS {database}.test (x Int32) ENGINE = Memory" + ) client.execute(f"INSERT INTO {database}.test (x) VALUES", [{"x": 100}]) result = client.execute(f"SELECT * FROM {database}.test") print(result) if __name__ == "__main__": - database = os.environ["CLICKHOUSE_DATABASE"] + database = os.environ["CLICKHOUSE_DATABASE"] run(database) From 09f68c1c69e38ca63487eadca0a60cf373130b16 Mon Sep 17 00:00:00 2001 From: divanik Date: Fri, 6 Dec 2024 15:22:56 +0000 Subject: [PATCH 24/41] Working state --- src/CMakeLists.txt | 1 + .../DataLakes/DataLakeConfiguration.h | 4 +- .../DataLakes/Iceberg/IcebergMetadata.cpp | 429 +++++++++ .../DataLakes/Iceberg/IcebergMetadata.h | 144 +++ .../DataLakes/Iceberg/ManifestFile.cpp | 181 ++++ .../DataLakes/Iceberg/ManifestFile.h | 71 ++ .../DataLakes/Iceberg/ManifestFileImpl.h | 57 ++ .../DataLakes/Iceberg/SchemaProcessor.cpp | 363 ++++++++ .../DataLakes/Iceberg/SchemaProcessor.h | 107 +++ .../DataLakes/Iceberg/Snapshot.h | 44 + .../ObjectStorage/DataLakes/Iceberg/Utils.cpp | 48 + .../ObjectStorage/DataLakes/Iceberg/Utils.h | 35 + .../DataLakes/IcebergMetadata.cpp | 839 ------------------ .../ObjectStorage/DataLakes/IcebergMetadata.h | 243 ----- .../ObjectStorage/StorageObjectStorage.cpp | 2 +- .../StorageObjectStorageSource.cpp | 11 +- .../integration/test_storage_iceberg/test.py | 1 - 17 files changed, 1492 insertions(+), 1088 deletions(-) create mode 100644 src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp create mode 100644 src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h create mode 100644 src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.cpp create mode 100644 src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h create mode 100644 src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileImpl.h create mode 100644 src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.cpp create mode 
100644 src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.h create mode 100644 src/Storages/ObjectStorage/DataLakes/Iceberg/Snapshot.h create mode 100644 src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.cpp create mode 100644 src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.h delete mode 100644 src/Storages/ObjectStorage/DataLakes/IcebergMetadata.cpp delete mode 100644 src/Storages/ObjectStorage/DataLakes/IcebergMetadata.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7385d1510cf..d0188cea331 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -129,6 +129,7 @@ add_headers_and_sources(dbms Storages/ObjectStorage/S3) add_headers_and_sources(dbms Storages/ObjectStorage/HDFS) add_headers_and_sources(dbms Storages/ObjectStorage/Local) add_headers_and_sources(dbms Storages/ObjectStorage/DataLakes) +add_headers_and_sources(dbms Storages/ObjectStorage/DataLakes/Iceberg) add_headers_and_sources(dbms Common/NamedCollections) add_headers_and_sources(dbms Common/Scheduler/Workload) diff --git a/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h b/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h index ede70567da4..f32631c4438 100644 --- a/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h +++ b/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include #include #include @@ -139,6 +139,8 @@ private: } }; +using IcebergMetadata = Iceberg::IcebergMetadata; + #if USE_AVRO #if USE_AWS_S3 using StorageS3IcebergConfiguration = DataLakeConfiguration; diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp new file mode 100644 index 00000000000..42924179246 --- /dev/null +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp @@ -0,0 +1,429 @@ +#include +#include +#include "Common/Config/ConfigProcessor.h" +#include "Common/DateLUT.h" +#include "Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h" +#include "config.h" + +#if USE_AVRO + +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +# include +# include + +# include "Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h" +# include "Storages/ObjectStorage/DataLakes/Iceberg/Utils.h" + +# include "Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileImpl.h" +# include "Storages/ObjectStorage/DataLakes/Iceberg/Snapshot.h" + +namespace DB +{ +namespace Setting +{ +extern const SettingsBool allow_data_lake_dynamic_schema; +} + +namespace ErrorCodes +{ +extern const int FILE_DOESNT_EXIST; +extern const int ILLEGAL_COLUMN; +extern const int BAD_ARGUMENTS; +extern const int UNSUPPORTED_METHOD; +extern const int LOGICAL_ERROR; +} + +namespace Iceberg +{ + +Int32 parseTableSchema( + const Poco::JSON::Object::Ptr & metadata_object, IcebergSchemaProcessor & schema_processor, const LoggerPtr & metadata_logger); + +std::pair +parseTableSchemaFromManifestFile(const avro::DataFileReaderBase & manifest_file_reader, const String & manifest_file_name) +{ + auto avro_metadata = manifest_file_reader.metadata(); + auto avro_schema_it = avro_metadata.find("schema"); + if (avro_schema_it == avro_metadata.end()) + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Cannot read Iceberg table: manifest file {} doesn't have 
table schema in its metadata", + manifest_file_name); + std::vector schema_json = avro_schema_it->second; + String schema_json_string = String(reinterpret_cast(schema_json.data()), schema_json.size()); + Poco::JSON::Parser parser; + Poco::Dynamic::Var json = parser.parse(schema_json_string); + const Poco::JSON::Object::Ptr & schema_object = json.extract(); + Int32 schema_object_id = schema_object->getValue("schema-id"); + return {schema_object_id, schema_object}; +} + + +IcebergMetadata::IcebergMetadata( + ObjectStoragePtr object_storage_, + ConfigurationObserverPtr configuration_, + const DB::ContextPtr & context_, + Int32 metadata_version_, + Int32 format_version_, + const String & manifest_list_file_, + const Poco::JSON::Object::Ptr & object) + : WithContext(context_) + , object_storage(std::move(object_storage_)) + , configuration(std::move(configuration_)) + , schema_processor(IcebergSchemaProcessor()) + , log(getLogger("IcebergMetadata")) + , metadata_version(metadata_version_) + , format_version(format_version_) + , current_snapshot(manifest_list_file_.empty() ? std::nullopt : std::optional{getSnapshot(manifest_list_file_)}) +{ + auto schema_id = parseTableSchema(object, schema_processor, log); + schema = *(schema_processor.getClickhouseTableSchemaById(schema_id)); + current_schema_id = schema_id; +} + +std::pair parseTableSchemaV2Method(const Poco::JSON::Object::Ptr & metadata_object) +{ + Poco::JSON::Object::Ptr schema; + if (!metadata_object->has("current-schema-id")) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot parse Iceberg table schema: 'current-schema-id' field is missing in metadata"); + auto current_schema_id = metadata_object->getValue("current-schema-id"); + if (!metadata_object->has("schemas")) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot parse Iceberg table schema: 'schemas' field is missing in metadata"); + auto schemas = metadata_object->get("schemas").extract(); + if (schemas->size() == 0) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot parse Iceberg table schema: schemas field is empty"); + for (uint32_t i = 0; i != schemas->size(); ++i) + { + auto current_schema = schemas->getObject(i); + if (!current_schema->has("schema-id")) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot parse Iceberg table schema: 'schema-id' field is missing in schema"); + } + if (current_schema->getValue("schema-id") == current_schema_id) + { + schema = current_schema; + break; + } + } + + if (!schema) + throw Exception(ErrorCodes::BAD_ARGUMENTS, R"(There is no schema with "schema-id" that matches "current-schema-id" in metadata)"); + if (schema->getValue("schema-id") != current_schema_id) + throw Exception(ErrorCodes::BAD_ARGUMENTS, R"(Field "schema-id" of the schema doesn't match "current-schema-id" in metadata)"); + return {schema, current_schema_id}; +} + +std::pair parseTableSchemaV1Method(const Poco::JSON::Object::Ptr & metadata_object) +{ + if (!metadata_object->has("schema")) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot parse Iceberg table schema: 'schema' field is missing in metadata"); + Poco::JSON::Object::Ptr schema = metadata_object->getObject("schema"); + if (!metadata_object->has("schema")) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot parse Iceberg table schema: 'schema-id' field is missing in schema"); + auto current_schema_id = schema->getValue("schema-id"); + return {schema, current_schema_id}; +} + +Int32 parseTableSchema( + const Poco::JSON::Object::Ptr & metadata_object, IcebergSchemaProcessor & schema_processor, const 
LoggerPtr & metadata_logger) +{ + Int32 format_version = metadata_object->getValue("format-version"); + if (format_version == 2) + { + auto [schema, current_schema_id] = parseTableSchemaV2Method(metadata_object); + schema_processor.addIcebergTableSchema(schema); + return current_schema_id; + } + else + { + try + { + auto [schema, current_schema_id] = parseTableSchemaV1Method(metadata_object); + schema_processor.addIcebergTableSchema(schema); + return current_schema_id; + } + catch (const Exception & first_error) + { + if (first_error.code() != ErrorCodes::BAD_ARGUMENTS) + throw; + try + { + auto [schema, current_schema_id] = parseTableSchemaV2Method(metadata_object); + schema_processor.addIcebergTableSchema(schema); + LOG_WARNING( + metadata_logger, + "Iceberg table schema was parsed using v2 specification, but it was impossible to parse it using v1 " + "specification. Be " + "aware that you Iceberg writing engine violates Iceberg specification. Error during parsing {}", + first_error.displayText()); + return current_schema_id; + } + catch (const Exception & second_error) + { + if (first_error.code() != ErrorCodes::BAD_ARGUMENTS) + throw; + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Cannot parse Iceberg table schema both with v1 and v2 methods. Old method error: {}. New method error: {}", + first_error.displayText(), + second_error.displayText()); + } + } + } +} + +/** + * Each version of table metadata is stored in a `metadata` directory and + * has one of 2 formats: + * 1) v.metadata.json, where V - metadata version. + * 2) -.metadata.json, where V - metadata version + */ +std::pair +getMetadataFileAndVersion(const ObjectStoragePtr & object_storage, const StorageObjectStorage::Configuration & configuration) +{ + const auto metadata_files = listFiles(*object_storage, configuration, "metadata", ".metadata.json"); + if (metadata_files.empty()) + { + throw Exception( + ErrorCodes::FILE_DOESNT_EXIST, "The metadata file for Iceberg table with path {} doesn't exist", configuration.getPath()); + } + + std::vector> metadata_files_with_versions; + metadata_files_with_versions.reserve(metadata_files.size()); + for (const auto & path : metadata_files) + { + String file_name(path.begin() + path.find_last_of('/') + 1, path.end()); + String version_str; + /// v.metadata.json + if (file_name.starts_with('v')) + version_str = String(file_name.begin() + 1, file_name.begin() + file_name.find_first_of('.')); + /// -.metadata.json + else + version_str = String(file_name.begin(), file_name.begin() + file_name.find_first_of('-')); + + if (!std::all_of(version_str.begin(), version_str.end(), isdigit)) + throw Exception( + ErrorCodes::BAD_ARGUMENTS, "Bad metadata file name: {}. 
Expected vN.metadata.json where N is a number", file_name); + metadata_files_with_versions.emplace_back(std::stoi(version_str), path); + } + + /// Get the latest version of metadata file: v.metadata.json + return *std::max_element(metadata_files_with_versions.begin(), metadata_files_with_versions.end()); +} + + +DataLakeMetadataPtr IcebergMetadata::create( + const ObjectStoragePtr & object_storage, const ConfigurationObserverPtr & configuration, const ContextPtr & local_context) +{ + auto configuration_ptr = configuration.lock(); + + const auto [metadata_version, metadata_file_path] = getMetadataFileAndVersion(object_storage, *configuration_ptr); + + auto log = getLogger("IcebergMetadata"); + LOG_DEBUG(log, "Parse metadata {}", metadata_file_path); + + StorageObjectStorageSource::ObjectInfo object_info(metadata_file_path); + auto buf = StorageObjectStorageSource::createReadBuffer(object_info, object_storage, local_context, log); + + String json_str; + readJSONObjectPossiblyInvalid(json_str, *buf); + + Poco::JSON::Parser parser; /// For some reason base/base/JSON.h can not parse this json file + Poco::Dynamic::Var json = parser.parse(json_str); + const Poco::JSON::Object::Ptr & object = json.extract(); + + IcebergSchemaProcessor schema_processor; + + auto format_version = object->getValue("format-version"); + + auto snapshots = object->get("snapshots").extract(); + + String manifest_list_file; + auto current_snapshot_id = object->getValue("current-snapshot-id"); + + LOG_DEBUG(&Poco::Logger::get("IcebergMetadata initialize"), "Current snapshot id {}", current_snapshot_id); + + for (size_t i = 0; i < snapshots->size(); ++i) + { + const auto snapshot = snapshots->getObject(static_cast(i)); + LOG_DEBUG( + &Poco::Logger::get("IcebergMetadata initialize"), + "Iterationg on snapshot with id {}", + snapshot->getValue("snapshot-id")); + + if (snapshot->getValue("snapshot-id") == current_snapshot_id) + { + const auto path = snapshot->getValue("manifest-list"); + manifest_list_file = std::filesystem::path(configuration_ptr->getPath()) / "metadata" / std::filesystem::path(path).filename(); + break; + } + } + + auto ptr = std::make_unique( + object_storage, configuration_ptr, local_context, metadata_version, format_version, manifest_list_file, object); + + + return ptr; +} + +/** + * Manifest file has the following format: '/iceberg_data/db/table_name/metadata/c87bfec7-d36c-4075-ad04-600b6b0f2020-m0.avro' + * + * `manifest file` is different in format version V1 and V2 and has the following contents: + * v1 v2 + * status req req + * snapshot_id req opt + * sequence_number opt + * file_sequence_number opt + * data_file req req + * Example format version V1: + * ┌─status─┬─────────snapshot_id─┬─data_file───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ + * │ 1 │ 2819310504515118887 │ ('/iceberg_data/db/table_name/data/00000-1-3edca534-15a0-4f74-8a28-4733e0bf1270-00001.parquet','PARQUET',(),100,1070,67108864,[(1,233),(2,210)],[(1,100),(2,100)],[(1,0),(2,0)],[],[(1,'\0'),(2,'0')],[(1,'c'),(2,'99')],NULL,[4],0) │ + * └────────┴─────────────────────┴─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ + * Example format version 
V2: + * ┌─status─┬─────────snapshot_id─┬─sequence_number─┬─file_sequence_number─┬─data_file───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ + * │ 1 │ 5887006101709926452 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ (0,'/iceberg_data/db/table_name/data/00000-1-c8045c90-8799-4eac-b957-79a0484e223c-00001.parquet','PARQUET',(),100,1070,[(1,233),(2,210)],[(1,100),(2,100)],[(1,0),(2,0)],[],[(1,'\0'),(2,'0')],[(1,'c'),(2,'99')],NULL,[4],[],0) │ + * └────────┴─────────────────────┴─────────────────┴──────────────────────┴─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ + * In case of partitioned data we'll have extra directory partition=value: + * ─status─┬─────────snapshot_id─┬─data_file──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ + * │ 1 │ 2252246380142525104 │ ('/iceberg_data/db/table_name/data/a=0/00000-1-c9535a00-2f4f-405c-bcfa-6d4f9f477235-00001.parquet','PARQUET',(0),1,631,67108864,[(1,46),(2,48)],[(1,1),(2,1)],[(1,0),(2,0)],[],[(1,'\0\0\0\0\0\0\0\0'),(2,'1')],[(1,'\0\0\0\0\0\0\0\0'),(2,'1')],NULL,[4],0) │ + * │ 1 │ 2252246380142525104 │ ('/iceberg_data/db/table_name/data/a=1/00000-1-c9535a00-2f4f-405c-bcfa-6d4f9f477235-00002.parquet','PARQUET',(1),1,631,67108864,[(1,46),(2,48)],[(1,1),(2,1)],[(1,0),(2,0)],[],[(1,'\0\0\0\0\0\0\0'),(2,'2')],[(1,'\0\0\0\0\0\0\0'),(2,'2')],NULL,[4],0) │ + * │ 1 │ 2252246380142525104 │ ('/iceberg_data/db/table_name/data/a=2/00000-1-c9535a00-2f4f-405c-bcfa-6d4f9f477235-00003.parquet','PARQUET',(2),1,631,67108864,[(1,46),(2,48)],[(1,1),(2,1)],[(1,0),(2,0)],[],[(1,'\0\0\0\0\0\0\0'),(2,'3')],[(1,'\0\0\0\0\0\0\0'),(2,'3')],NULL,[4],0) │ + * └────────┴─────────────────────┴────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ + */ + +ManifestList IcebergMetadata::initializeManifestList(const String & manifest_list_file) const +{ + auto configuration_ptr = configuration.lock(); + if (configuration_ptr == nullptr) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Configuration is expired"); + + auto context = getContext(); + StorageObjectStorageSource::ObjectInfo object_info(manifest_list_file); + auto manifest_list_buf = StorageObjectStorageSource::createReadBuffer(object_info, object_storage, context, log); + + LOG_DEBUG(&Poco::Logger::get("initializeManifestList"), "Parse manifest list {}", manifest_list_file); + auto manifest_list_file_reader + = std::make_unique(std::make_unique(*manifest_list_buf)); + + LOG_DEBUG(&Poco::Logger::get("initializeManifestList"), "Parsed manifest list {}", manifest_list_file); + + auto data_type = AvroSchemaReader::avroNodeToDataType(manifest_list_file_reader->dataSchema().root()->leafAt(0)); + Block header{{data_type->createColumn(), data_type, "manifest_path"}}; + auto columns = parseAvro(*manifest_list_file_reader, header, getFormatSettings(context)); + auto & col = columns.at(0); + + if 
(col->getDataType() != TypeIndex::String) + { + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "The parsed column from Avro file of `manifest_path` field should be String type, got {}", + col->getFamilyName()); + } + + const auto * col_str = typeid_cast(col.get()); + std::vector manifest_files; + for (size_t i = 0; i < col_str->size(); ++i) + { + const auto file_path = col_str->getDataAt(i).toView(); + const auto filename = std::filesystem::path(file_path).filename(); + String manifest_file = std::filesystem::path(configuration_ptr->getPath()) / "metadata" / filename; + auto manifest_file_it = manifest_files_by_name.find(manifest_file); + if (manifest_file_it != manifest_files_by_name.end()) + { + manifest_files.emplace_back(manifest_file_it); + continue; + } + manifest_files.emplace_back(initializeManifestFile(filename, configuration_ptr)); + } + + return ManifestList{manifest_files}; +} + +ManifestFileEntry IcebergMetadata::initializeManifestFile(const String & filename, const ConfigurationPtr & configuration_ptr) const +{ + String manifest_file = std::filesystem::path(configuration_ptr->getPath()) / "metadata" / filename; + + StorageObjectStorageSource::ObjectInfo manifest_object_info(manifest_file); + auto buffer = StorageObjectStorageSource::createReadBuffer(manifest_object_info, object_storage, getContext(), log); + auto manifest_file_reader = std::make_unique(std::make_unique(*buffer)); + auto [schema_id, schema_object] = parseTableSchemaFromManifestFile(*manifest_file_reader, filename); + auto manifest_file_impl = std::make_unique( + std::move(manifest_file_reader), format_version, configuration_ptr->getPath(), getFormatSettings(getContext()), schema_id); + auto [manifest_file_iterator, _inserted] + = manifest_files_by_name.emplace(manifest_file, ManifestFileContent(std::move(manifest_file_impl))); + ManifestFileEntry manifest_file_entry{manifest_file_iterator}; + for (const auto & data_file : manifest_file_entry.getContent().getDataFiles()) + { + manifest_entry_by_data_file.emplace(data_file.data_file_name, manifest_file_entry); + } + LOG_DEBUG(&Poco::Logger::get("IcebergMetadata"), "Added manifest file {}", manifest_file); + + schema_processor.addIcebergTableSchema(schema_object); + return manifest_file_entry; +} + + +IcebergSnapshot IcebergMetadata::getSnapshot(const String & manifest_list_file) const +{ + const auto manifest_list_file_it = manifest_lists_by_name.find(manifest_list_file); + if (manifest_list_file_it != manifest_lists_by_name.end()) + return IcebergSnapshot(manifest_list_file_it); + return IcebergSnapshot{manifest_lists_by_name.emplace(manifest_list_file, initializeManifestList(manifest_list_file)).first}; +} + + +Strings IcebergMetadata::getDataFiles() const +{ + std::lock_guard lock(get_data_files_mutex); + if (!data_files.empty()) + return data_files; + + if (!current_snapshot) + { + return {}; + } + + for (const auto & manifest_entry : current_snapshot->getManifestList().getManifestFiles()) + { + for (const auto & data_file : manifest_entry.getContent().getDataFiles()) + { + if (data_file.status != ManifestEntryStatus::DELETED) + { + data_files.push_back(data_file.data_file_name); + } + } + } + + return data_files; +} + +} + +} + +#endif diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h new file mode 100644 index 00000000000..2091649a0ec --- /dev/null +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h @@ -0,0 +1,144 @@ +#pragma once +#include 
"config.h" + +#if USE_AVRO /// StorageIceberg depending on Avro to parse metadata with Avro format. + +# include +# include +# include +# include +# include + +# include +# include +# include + +# include "Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h" +# include "Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.h" +# include "Storages/ObjectStorage/DataLakes/Iceberg/Snapshot.h" + +# include +# include +# include +# include + +namespace DB +{ + +namespace ErrorCodes +{ +extern const int BAD_ARGUMENTS; +} + +namespace Iceberg +{ + +class IcebergMetadata : public IDataLakeMetadata, private WithContext +{ +public: + using ConfigurationObserverPtr = StorageObjectStorage::ConfigurationObserverPtr; + using ConfigurationPtr = StorageObjectStorage::ConfigurationPtr; + + + static constexpr auto name = "Iceberg"; + + IcebergMetadata( + ObjectStoragePtr object_storage_, + ConfigurationObserverPtr configuration_, + const DB::ContextPtr & context_, + Int32 metadata_version_, + Int32 format_version_, + const String & manifest_list_file_, + const Poco::JSON::Object::Ptr & object); + + + /// Get data files. On first request it reads manifest_list file and iterates through manifest files to find all data files. + /// All subsequent calls will return saved list of files (because it cannot be changed without changing metadata file) + Strings getDataFiles() const override; + + /// Get table schema parsed from metadata. + NamesAndTypesList getTableSchema() const override { return schema; } + + const std::unordered_map & getColumnNameToPhysicalNameMapping() const override { return column_name_to_physical_name; } + + const DataLakePartitionColumns & getPartitionColumns() const override { return partition_columns; } + + bool operator==(const IDataLakeMetadata & other) const override + { + const auto * iceberg_metadata = dynamic_cast(&other); + return iceberg_metadata && getVersion() == iceberg_metadata->getVersion(); + } + + static DataLakeMetadataPtr + create(const ObjectStoragePtr & object_storage, const ConfigurationObserverPtr & configuration, const ContextPtr & local_context); + + size_t getVersion() const { return metadata_version; } + + std::shared_ptr getInitialSchemaByPath(const String & data_path) const override + { + auto version_if_outdated = getSchemaVersionByFileIfOutdated(data_path); + return version_if_outdated.has_value() ? schema_processor.getClickhouseTableSchemaById(version_if_outdated.value()) : nullptr; + } + + std::shared_ptr getSchemaTransformer(const String & data_path) const override + { + auto version_if_outdated = getSchemaVersionByFileIfOutdated(data_path); + return version_if_outdated.has_value() + ? 
schema_processor.getSchemaTransformationDagByIds(version_if_outdated.value(), current_schema_id)
+            : nullptr;
+    }
+
+    bool supportsExternalMetadataChange() const override { return true; }
+
+private:
+    using ManifestEntryByDataFile = std::unordered_map;
+
+    const ObjectStoragePtr object_storage;
+    const ConfigurationObserverPtr configuration;
+
+    mutable IcebergSchemaProcessor schema_processor;
+    LoggerPtr log;
+
+    mutable ManifestFilesByName manifest_files_by_name;
+    mutable ManifestListsByName manifest_lists_by_name;
+    mutable ManifestEntryByDataFile manifest_entry_by_data_file;
+
+    Int32 metadata_version;
+    Int32 format_version;
+    Int32 current_schema_id;
+
+    std::optional current_snapshot;
+
+    mutable Strings data_files;
+    std::unordered_map column_name_to_physical_name;
+    DataLakePartitionColumns partition_columns;
+    NamesAndTypesList schema;
+
+    mutable std::mutex get_data_files_mutex;
+
+    ManifestList initializeManifestList(const String & manifest_list_file) const;
+
+    IcebergSnapshot getSnapshot(const String & manifest_list_file) const;
+
+    std::optional getSchemaVersionByFileIfOutdated(String data_path) const
+    {
+        auto manifest_file_it = manifest_entry_by_data_file.find(data_path);
+        if (manifest_file_it == manifest_entry_by_data_file.end())
+        {
+            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot find schema version for data file: {}", data_path);
+        }
+        auto schema_id = manifest_file_it->second.getContent().getSchemaId();
+        if (schema_id == current_schema_id)
+            return std::nullopt;
+        return std::optional{schema_id};
+    }
+
+    ManifestFileEntry getManifestFile(const String & manifest_file) const;
+
+    ManifestFileEntry initializeManifestFile(const String & filename, const ConfigurationPtr & configuration_ptr) const;
+};
+
+}
+}
+
+#endif
diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.cpp
new file mode 100644
index 00000000000..1819d8bf599
--- /dev/null
+++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.cpp
@@ -0,0 +1,181 @@
+#include "config.h"
+
+#if USE_AVRO
+
+
+# include "Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileImpl.h"
+# include "Storages/ObjectStorage/DataLakes/Iceberg/Utils.h"
+
+
+# include
+# include
+# include
+# include
+# include
+# include
+
+# include
+# include
+# include
+
+# include
+# include "DataTypes/DataTypeTuple.h"
+# include "Formats/FormatSettings.h"
+
+# include
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+extern const int ILLEGAL_COLUMN;
+extern const int BAD_ARGUMENTS;
+extern const int UNSUPPORTED_METHOD;
+}
+
+namespace Iceberg
+{
+
+const std::vector & ManifestFileContent::getDataFiles() const
+{
+    return impl->data_files;
+}
+
+Int32 ManifestFileContent::getSchemaId() const
+{
+    return impl->schema_id;
+}
+
+ManifestFileContent::ManifestFileContent(std::unique_ptr impl_) : impl(std::move(impl_))
+{
+}
+
+
+ManifestFileContentImpl::ManifestFileContentImpl(
+    std::unique_ptr manifest_file_reader_,
+    Int32 format_version_,
+    const String & common_path,
+    const FormatSettings & format_settings,
+    Int32 schema_id_)
+{
+    this->schema_id = schema_id_;
+    avro::NodePtr root_node = manifest_file_reader_->dataSchema().root();
+    size_t leaves_num = root_node->leaves();
+    size_t expected_min_num = format_version_ == 1 ? 3 : 2;
+    if (leaves_num < expected_min_num)
+    {
+        throw Exception(
+            ErrorCodes::BAD_ARGUMENTS, "Unexpected number of columns {}.
Expected at least {}", root_node->leaves(), expected_min_num); + } + + avro::NodePtr status_node = root_node->leafAt(0); + if (status_node->type() != avro::Type::AVRO_INT) + { + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "The parsed column from Avro file of `status` field should be Int type, got {}", + magic_enum::enum_name(status_node->type())); + } + + avro::NodePtr data_file_node = root_node->leafAt(static_cast(leaves_num) - 1); + if (data_file_node->type() != avro::Type::AVRO_RECORD) + { + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "The parsed column from Avro file of `data_file` field should be Tuple type, got {}", + magic_enum::enum_name(data_file_node->type())); + } + + auto status_col_data_type = AvroSchemaReader::avroNodeToDataType(status_node); + auto data_col_data_type = AvroSchemaReader::avroNodeToDataType(data_file_node); + Block manifest_file_header + = {{status_col_data_type->createColumn(), status_col_data_type, "status"}, + {data_col_data_type->createColumn(), data_col_data_type, "data_file"}}; + + auto columns = parseAvro(*manifest_file_reader_, manifest_file_header, format_settings); + if (columns.size() != 2) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Unexpected number of columns. Expected 2, got {}", columns.size()); + + if (columns.at(0)->getDataType() != TypeIndex::Int32) + { + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "The parsed column from Avro file of `status` field should be Int32 type, got {}", + columns.at(0)->getFamilyName()); + } + if (columns.at(1)->getDataType() != TypeIndex::Tuple) + { + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "The parsed column from Avro file of `file_path` field should be Tuple type, got {}", + columns.at(1)->getFamilyName()); + } + + const auto * status_int_column = assert_cast(columns.at(0).get()); + const auto & data_file_tuple_type = assert_cast(*data_col_data_type.get()); + const auto * data_file_tuple_column = assert_cast(columns.at(1).get()); + + if (status_int_column->size() != data_file_tuple_column->size()) + { + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "The parsed column from Avro file of `file_path` and `status` have different rows number: {} and {}", + status_int_column->size(), + data_file_tuple_column->size()); + } + + ColumnPtr file_path_column = data_file_tuple_column->getColumnPtr(data_file_tuple_type.getPositionByName("file_path")); + + if (file_path_column->getDataType() != TypeIndex::String) + { + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "The parsed column from Avro file of `file_path` field should be String type, got {}", + file_path_column->getFamilyName()); + } + + const auto * file_path_string_column = assert_cast(file_path_column.get()); + + ColumnPtr content_column; + const ColumnInt32 * content_int_column = nullptr; + if (format_version_ == 2) + { + content_column = data_file_tuple_column->getColumnPtr(data_file_tuple_type.getPositionByName("content")); + if (content_column->getDataType() != TypeIndex::Int32) + { + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "The parsed column from Avro file of `content` field should be Int type, got {}", + content_column->getFamilyName()); + } + + content_int_column = assert_cast(content_column.get()); + } + + for (size_t i = 0; i < data_file_tuple_column->size(); ++i) + { + DataFileContent content_type = DataFileContent::DATA; + if (format_version_ == 2) + { + content_type = DataFileContent(content_int_column->getElement(i)); + if (content_type != DataFileContent::DATA) + throw Exception( + ErrorCodes::UNSUPPORTED_METHOD, 
"Cannot read Iceberg table: positional and equality deletes are not supported"); + } + const auto status = ManifestEntryStatus(status_int_column->getInt(i)); + + const auto data_path = std::string(file_path_string_column->getDataAt(i).toView()); + const auto pos = data_path.find(common_path); + if (pos == std::string::npos) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Expected to find {} in data path: {}", common_path, data_path); + + const auto file_path = data_path.substr(pos); + this->data_files.push_back({file_path, status, content_type}); + } +} + +} + +} + +#endif diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h new file mode 100644 index 00000000000..9dd98799917 --- /dev/null +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h @@ -0,0 +1,71 @@ +#pragma once +#include +#include "config.h" + +#if USE_AVRO + +# include +# include + + +namespace DB +{ + + +namespace Iceberg +{ +class ManifestFileContentImpl; + +enum class ManifestEntryStatus : uint8_t +{ + EXISTING = 0, + ADDED = 1, + DELETED = 2, + +}; + +enum class DataFileContent : uint8_t +{ + DATA = 0, + POSITION_DELETES = 1, + EQUALITY_DELETES = 2, +}; + +struct DataFileEntry +{ + String data_file_name; + ManifestEntryStatus status; + DataFileContent content; +}; + +class ManifestFileContent +{ +public: + explicit ManifestFileContent(std::unique_ptr impl_); + + const std::vector & getDataFiles() const; + Int32 getSchemaId() const; + +private: + std::unique_ptr impl; +}; + + +using ManifestFilesByName = std::map; + +struct ManifestFileEntry +{ + explicit ManifestFileEntry(const ManifestFilesByName::const_iterator & reference_) : reference(reference_) { } + const ManifestFileContent & getContent() const { return reference->second; } + const String & getName() const { return reference->first; } + + +private: + ManifestFilesByName::const_iterator reference; +}; + +} + +} + +#endif diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileImpl.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileImpl.h new file mode 100644 index 00000000000..75618a6a44b --- /dev/null +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileImpl.h @@ -0,0 +1,57 @@ +#include "config.h" + +#if USE_AVRO + + +# include "Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h" +# include "Storages/ObjectStorage/DataLakes/Iceberg/Utils.h" + + +# include +# include +# include +# include +# include +# include + +# include +# include +# include + +# include +# include "DataTypes/DataTypeTuple.h" +# include "Formats/FormatSettings.h" + +# include + + +namespace DB +{ + +namespace ErrorCodes +{ +extern const int ILLEGAL_COLUMN; +} + +namespace Iceberg +{ + +class ManifestFileContentImpl +{ +public: + explicit ManifestFileContentImpl( + std::unique_ptr manifest_file_reader_, + Int32 format_version_, + const String & common_path, + const FormatSettings & format_settings, + Int32 schema_id_); + + Int32 schema_id; + std::vector data_files; +}; + +} + +} + +#endif diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.cpp new file mode 100644 index 00000000000..886626c6de0 --- /dev/null +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.cpp @@ -0,0 +1,363 @@ +#include "Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.h" + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include 
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+extern const int LOGICAL_ERROR;
+extern const int BAD_ARGUMENTS;
+extern const int UNSUPPORTED_METHOD;
+}
+
+namespace Iceberg
+{
+
+
+namespace
+{
+
+bool operator==(const Poco::JSON::Object & first, const Poco::JSON::Object & second)
+{
+    std::stringstream first_string_stream; // STYLE_CHECK_ALLOW_STD_STRING_STREAM
+    std::stringstream second_string_stream; // STYLE_CHECK_ALLOW_STD_STRING_STREAM
+    first.stringify(first_string_stream);
+    if (!first_string_stream)
+    {
+        throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "JSON Parsing failed");
+    }
+    second.stringify(second_string_stream);
+    if (!second_string_stream)
+    {
+        throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "JSON Parsing failed");
+    }
+    return first_string_stream.str() == second_string_stream.str();
+}
+
+std::pair parseDecimal(const String & type_name)
+{
+    DB::ReadBufferFromString buf(std::string_view(type_name.begin() + 8, type_name.end() - 1));
+    size_t precision;
+    size_t scale;
+    readIntText(precision, buf);
+    skipWhitespaceIfAny(buf);
+    assertChar(',', buf);
+    skipWhitespaceIfAny(buf);
+    tryReadIntText(scale, buf);
+    return {precision, scale};
+}
+
+}
+
+void IcebergSchemaProcessor::addIcebergTableSchema(Poco::JSON::Object::Ptr schema_ptr)
+{
+    Int32 schema_id = schema_ptr->getValue("schema-id");
+    if (iceberg_table_schemas_by_ids.contains(schema_id))
+    {
+        chassert(clickhouse_table_schemas_by_ids.contains(schema_id));
+        chassert(*iceberg_table_schemas_by_ids.at(schema_id) == *schema_ptr);
+    }
+    else
+    {
+        iceberg_table_schemas_by_ids[schema_id] = schema_ptr;
+        auto fields = schema_ptr->get("fields").extract();
+        auto clickhouse_schema = std::make_shared();
+        for (size_t i = 0; i != fields->size(); ++i)
+        {
+            auto field = fields->getObject(static_cast(i));
+            auto name = field->getValue("name");
+            bool required = field->getValue("required");
+            clickhouse_schema->push_back(NameAndTypePair{name, getFieldType(field, "type", required)});
+        }
+        clickhouse_table_schemas_by_ids[schema_id] = clickhouse_schema;
+    }
+}
+
+
+DataTypePtr IcebergSchemaProcessor::getSimpleType(const String & type_name)
+{
+    if (type_name == "boolean")
+        return DataTypeFactory::instance().get("Bool");
+    if (type_name == "int")
+        return std::make_shared();
+    if (type_name == "long")
+        return std::make_shared();
+    if (type_name == "float")
+        return std::make_shared();
+    if (type_name == "double")
+        return std::make_shared();
+    if (type_name == "date")
+        return std::make_shared();
+    if (type_name == "time")
+        return std::make_shared();
+    if (type_name == "timestamp")
+        return std::make_shared(6);
+    if (type_name == "timestamptz")
+        return std::make_shared(6, "UTC");
+    if (type_name == "string" || type_name == "binary")
+        return std::make_shared();
+    if (type_name == "uuid")
+        return std::make_shared();
+
+    if (type_name.starts_with("fixed[") && type_name.ends_with(']'))
+    {
+        ReadBufferFromString buf(std::string_view(type_name.begin() + 6, type_name.end() - 1));
+        size_t n;
+        readIntText(n, buf);
+        return std::make_shared(n);
+    }
+
+    if (type_name.starts_with("decimal(") && type_name.ends_with(')'))
+    {
+        auto [precision, scale] = parseDecimal(type_name);
+        return createDecimal(precision, scale);
+    }
+
+    throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown Iceberg type: {}", type_name);
+}
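+
+/// Illustrative mapping, as a sketch of what getSimpleType() above produces
+/// (the concrete ClickHouse types come from the factory/make_shared calls):
+///     getSimpleType("long")          -> Int64
+///     getSimpleType("timestamptz")   -> DateTime64(6, 'UTC')
+///     getSimpleType("fixed[16]")     -> FixedString(16)
+///     getSimpleType("decimal(9, 2)") -> Decimal(9, 2), via createDecimal(9, 2)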
+
+DataTypePtr IcebergSchemaProcessor::getComplexTypeFromObject(const Poco::JSON::Object::Ptr & type)
+{
+    String type_name = type->getValue("type");
+    if (type_name == "list")
+    {
+        bool element_required = type->getValue("element-required");
+        auto element_type = getFieldType(type, "element", element_required);
+        return std::make_shared(element_type);
+    }
+
+    if (type_name == "map")
+    {
+        auto key_type = getFieldType(type, "key", true);
+        auto value_required = type->getValue("value-required");
+        auto value_type = getFieldType(type, "value", value_required);
+        return std::make_shared(key_type, value_type);
+    }
+
+    if (type_name == "struct")
+    {
+        DataTypes element_types;
+        Names element_names;
+        auto fields = type->get("fields").extract();
+        element_types.reserve(fields->size());
+        element_names.reserve(fields->size());
+        for (size_t i = 0; i != fields->size(); ++i)
+        {
+            auto field = fields->getObject(static_cast(i));
+            element_names.push_back(field->getValue("name"));
+            auto required = field->getValue("required");
+            element_types.push_back(getFieldType(field, "type", required));
+        }
+
+        return std::make_shared(element_types, element_names);
+    }
+
+    throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown Iceberg type: {}", type_name);
+}
+
+DataTypePtr IcebergSchemaProcessor::getFieldType(const Poco::JSON::Object::Ptr & field, const String & type_key, bool required)
+{
+    if (field->isObject(type_key))
+        return getComplexTypeFromObject(field->getObject(type_key));
+
+    auto type = field->get(type_key);
+    if (type.isString())
+    {
+        const String & type_name = type.extract();
+        auto data_type = getSimpleType(type_name);
+        return required ? data_type : makeNullable(data_type);
+    }
+
+    throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unexpected 'type' field: {}", type.toString());
+}
+
+
+/**
+* Iceberg allows only three types of primitive type conversion:
+* int -> long
+* float -> double
+* decimal(P, S) -> decimal(P', S) where P' > P
+* This function checks that `old_type` and `new_type` satisfy one of these conditions.
+**/ +bool IcebergSchemaProcessor::allowPrimitiveTypeConversion(const String & old_type, const String & new_type) +{ + bool allowed_type_conversion = (old_type == new_type); + allowed_type_conversion |= (old_type == "int") && (new_type == "long"); + allowed_type_conversion |= (old_type == "float") && (new_type == "double"); + if (old_type.starts_with("decimal(") && old_type.ends_with(')') && new_type.starts_with("decimal(") && new_type.ends_with(")")) + { + auto [old_precision, old_scale] = parseDecimal(old_type); + auto [new_precision, new_scale] = parseDecimal(new_type); + allowed_type_conversion |= (old_precision <= new_precision) && (old_scale == new_scale); + } + return allowed_type_conversion; +} + +// Ids are passed only for error logging purposes +std::shared_ptr IcebergSchemaProcessor::getSchemaTransformationDag( + const Poco::JSON::Object::Ptr & old_schema, const Poco::JSON::Object::Ptr & new_schema, Int32 old_id, Int32 new_id) +{ + std::unordered_map> old_schema_entries; + auto old_schema_fields = old_schema->get("fields").extract(); + std::shared_ptr dag = std::make_shared(); + auto & outputs = dag->getOutputs(); + for (size_t i = 0; i != old_schema_fields->size(); ++i) + { + auto field = old_schema_fields->getObject(static_cast(i)); + size_t id = field->getValue("id"); + auto name = field->getValue("name"); + bool required = field->getValue("required"); + old_schema_entries[id] = {field, &dag->addInput(name, getFieldType(field, "type", required))}; + } + auto new_schema_fields = new_schema->get("fields").extract(); + for (size_t i = 0; i != new_schema_fields->size(); ++i) + { + auto field = new_schema_fields->getObject(static_cast(i)); + size_t id = field->getValue("id"); + auto name = field->getValue("name"); + bool required = field->getValue("required"); + auto type = getFieldType(field, "type", required); + auto old_node_it = old_schema_entries.find(id); + if (old_node_it != old_schema_entries.end()) + { + auto [old_json, old_node] = old_node_it->second; + if (field->isObject("type")) + { + if (*old_json != *field) + { + throw Exception( + ErrorCodes::UNSUPPORTED_METHOD, + "Schema evolution is not supported for complex types yet, field id is {}, old schema id is {}, new schema id " + "is {}", + id, + old_id, + new_id); + } + else + { + outputs.push_back(old_node); + } + } + else + { + if (old_json->isObject("type")) + { + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Can't cast primitive type to the complex type, field id is {}, old schema id is {}, new schema id is {}", + id, + old_id, + new_id); + } + String old_type = old_json->getValue("type"); + String new_type = field->getValue("type"); + + const ActionsDAG::Node * node = old_node; + if (old_type == new_type) + { + if (old_json->getValue("name") != name) + { + node = &dag->addAlias(*old_node, name); + } + } + else if (allowPrimitiveTypeConversion(old_type, new_type)) + { + node = &dag->addCast(*old_node, getFieldType(field, "type", required), name); + } + outputs.push_back(node); + } + } + else + { + if (field->isObject("type")) + { + throw Exception( + ErrorCodes::UNSUPPORTED_METHOD, + "Adding a default column with id {} and complex type is not supported yet. Old schema id is {}, new schema id is " + "{}", + id, + old_id, + new_id); + } + if (!type->isNullable()) + { + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Cannot add a column with id {} with required values to the table during schema evolution. This is forbidden by " + "Iceberg format specification. 
Old schema id is {}, new "
+                    "schema id is {}",
+                    id,
+                    old_id,
+                    new_id);
+            }
+            ColumnPtr default_type_column = type->createColumnConstWithDefaultValue(0);
+            const auto & constant = dag->addColumn({default_type_column, type, name});
+            outputs.push_back(&constant);
+        }
+    }
+    return dag;
+}
+
+std::shared_ptr IcebergSchemaProcessor::getSchemaTransformationDagByIds(Int32 old_id, Int32 new_id)
+{
+    if (old_id == new_id)
+    {
+        return nullptr;
+    }
+    std::lock_guard lock(mutex);
+    auto required_transform_dag_it = transform_dags_by_ids.find({old_id, new_id});
+    if (required_transform_dag_it != transform_dags_by_ids.end())
+    {
+        return required_transform_dag_it->second;
+    }
+
+    auto old_schema_it = iceberg_table_schemas_by_ids.find(old_id);
+    if (old_schema_it == iceberg_table_schemas_by_ids.end())
+    {
+        throw Exception(ErrorCodes::BAD_ARGUMENTS, "Schema with schema-id {} is unknown", old_id);
+    }
+    auto new_schema_it = iceberg_table_schemas_by_ids.find(new_id);
+    if (new_schema_it == iceberg_table_schemas_by_ids.end())
+    {
+        throw Exception(ErrorCodes::BAD_ARGUMENTS, "Schema with schema-id {} is unknown", new_id);
+    }
+    return transform_dags_by_ids[{old_id, new_id}]
+        = getSchemaTransformationDag(old_schema_it->second, new_schema_it->second, old_id, new_id);
+}
+
+std::shared_ptr IcebergSchemaProcessor::getClickhouseTableSchemaById(Int32 id)
+{
+    auto it = clickhouse_table_schemas_by_ids.find(id);
+    if (it == clickhouse_table_schemas_by_ids.end())
+        throw Exception(ErrorCodes::BAD_ARGUMENTS, "Schema with id {} is unknown", id);
+    return it->second;
+}
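+
+/// Rough usage sketch (the schema ids are illustrative): when a data file was
+/// written with an older schema than the current table schema, callers fetch the
+/// cached DAG and apply it to the columns read from that file:
+///
+///     auto dag = schema_processor.getSchemaTransformationDagByIds(/*old_id=*/ 1, /*new_id=*/ 2);
+///     if (dag)  /// nullptr means the ids match and no transformation is needed
+///     {
+///         /// build ExpressionActions from *dag and execute them on the read block
+///     }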
+
+}
+
+}
diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.h
new file mode 100644
index 00000000000..755f8e940b4
--- /dev/null
+++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.h
@@ -0,0 +1,107 @@
+#pragma once
+
+#include
+#include
+#include
+#include "config.h"
+
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+namespace DB
+{
+
+namespace Iceberg
+{
+
+
+/**
+ * Iceberg supports the following data types (see https://iceberg.apache.org/spec/#schemas-and-data-types):
+ * - Primitive types:
+ *   - boolean
+ *   - int
+ *   - long
+ *   - float
+ *   - double
+ *   - decimal(P, S)
+ *   - date
+ *   - time (time of day in microseconds since midnight)
+ *   - timestamp (in microseconds since 1970-01-01)
+ *   - timestamptz (timestamp with timezone, stores values in UTC timezone)
+ *   - string
+ *   - uuid
+ *   - fixed(L) (fixed-length byte array of length L)
+ *   - binary
+ * - Complex types:
+ *   - struct(field1: Type1, field2: Type2, ...) (tuple of typed values)
+ *   - list(nested_type)
+ *   - map(Key, Value)
+ *
+ * Example of table schema in metadata:
+ * {
+ *     "type" : "struct",
+ *     "schema-id" : 0,
+ *     "fields" : [
+ *     {
+ *         "id" : 1,
+ *         "name" : "id",
+ *         "required" : false,
+ *         "type" : "long"
+ *     },
+ *     {
+ *         "id" : 2,
+ *         "name" : "array",
+ *         "required" : false,
+ *         "type" : {
+ *             "type" : "list",
+ *             "element-id" : 5,
+ *             "element" : "int",
+ *             "element-required" : false
+ *         }
+ *     },
+ *     {
+ *         "id" : 3,
+ *         "name" : "data",
+ *         "required" : false,
+ *         "type" : "binary"
+ *     }
+ *     ]
+ * }
+ */
+class IcebergSchemaProcessor
+{
+    using Node = ActionsDAG::Node;
+
+public:
+    void addIcebergTableSchema(Poco::JSON::Object::Ptr schema_ptr);
+    std::shared_ptr getClickhouseTableSchemaById(Int32 id);
+    std::shared_ptr getSchemaTransformationDagByIds(Int32 old_id, Int32 new_id);
+
+private:
+    std::unordered_map iceberg_table_schemas_by_ids;
+    std::unordered_map> clickhouse_table_schemas_by_ids;
+    std::map, std::shared_ptr> transform_dags_by_ids;
+
+    NamesAndTypesList getSchemaType(const Poco::JSON::Object::Ptr & schema);
+    DataTypePtr getComplexTypeFromObject(const Poco::JSON::Object::Ptr & type);
+    DataTypePtr getFieldType(const Poco::JSON::Object::Ptr & field, const String & type_key, bool required);
+    DataTypePtr getSimpleType(const String & type_name);
+
+    bool allowPrimitiveTypeConversion(const String & old_type, const String & new_type);
+    const Node * getDefaultNodeForField(const Poco::JSON::Object::Ptr & field);
+
+    std::shared_ptr getSchemaTransformationDag(
+        const Poco::JSON::Object::Ptr & old_schema, const Poco::JSON::Object::Ptr & new_schema, Int32 old_id, Int32 new_id);
+
+    std::mutex mutex;
+};
+
+}
+
+}
diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/Snapshot.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/Snapshot.h
new file mode 100644
index 00000000000..d75ebd6b2ab
--- /dev/null
+++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/Snapshot.h
@@ -0,0 +1,44 @@
+#pragma once
+#include "config.h"
+
+#if USE_AVRO
+# include "Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h"
+
+namespace DB
+{
+
+namespace Iceberg
+{
+
+
+class ManifestList
+{
+public:
+    explicit ManifestList(std::vector manifest_files_) : manifest_files(std::move(manifest_files_)) { }
+    const std::vector & getManifestFiles() const { return manifest_files; }
+
+private:
+    std::vector manifest_files;
+};
+
+using ManifestListsByName = std::map;
+
+class IcebergSnapshot
+{
+public:
+    explicit IcebergSnapshot(const ManifestListsByName::const_iterator & reference_) : reference(reference_) { }
+
+    const ManifestList & getManifestList() const { return reference->second; }
+    const String & getName() const { return reference->first; }
+
+
+private:
+    ManifestListsByName::const_iterator reference;
+};
+
+}
+
+}
+
+#endif
diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.cpp
new file mode 100644
index 00000000000..c5b1e5f65c8
--- /dev/null
+++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.cpp
@@ -0,0 +1,48 @@
+
+#include "config.h"
+
+#if USE_AVRO /// StorageIceberg depends on Avro to parse metadata in Avro format.
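+
+/// parseAvro() (declared in Utils.h, defined below) decodes all rows of an Avro
+/// container file into ClickHouse columns; the Block header selects which fields
+/// get deserialized. A rough usage sketch, mirroring how initializeManifestList()
+/// in this patch reads the `manifest_path` field:
+///
+///     auto data_type = AvroSchemaReader::avroNodeToDataType(reader->dataSchema().root()->leafAt(0));
+///     Block header{{data_type->createColumn(), data_type, "manifest_path"}};
+///     auto columns = parseAvro(*reader, header, getFormatSettings(context));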
+ +# include "Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h" + +# include +# include +# include +# include +# include +# include + +# include +# include +# include + +# include +# include "DataTypes/DataTypeTuple.h" +# include "Formats/FormatSettings.h" + +# include + +namespace DB +{ + +namespace Iceberg +{ +MutableColumns parseAvro(avro::DataFileReaderBase & file_reader, const Block & header, const FormatSettings & settings) +{ + auto deserializer = std::make_unique(header, file_reader.dataSchema(), true, true, settings); + MutableColumns columns = header.cloneEmptyColumns(); + + file_reader.init(); + RowReadExtension ext; + while (file_reader.hasMore()) + { + file_reader.decr(); + deserializer->deserializeRow(columns, file_reader.decoder(), ext); + } + return columns; +} + +} +} + +#endif diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.h new file mode 100644 index 00000000000..69515e50055 --- /dev/null +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.h @@ -0,0 +1,35 @@ +#pragma once + +#include "config.h" + +#if USE_AVRO /// StorageIceberg depending on Avro to parse metadata with Avro format. + + +# include +# include +# include +# include +# include +# include + +# include +# include +# include + +# include +# include "Formats/FormatSettings.h" + +# include + +namespace DB +{ + +namespace Iceberg +{ + +MutableColumns parseAvro(avro::DataFileReaderBase & file_reader, const Block & header, const FormatSettings & settings); + +} + +} +#endif diff --git a/src/Storages/ObjectStorage/DataLakes/IcebergMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/IcebergMetadata.cpp deleted file mode 100644 index 980d2f479cb..00000000000 --- a/src/Storages/ObjectStorage/DataLakes/IcebergMetadata.cpp +++ /dev/null @@ -1,839 +0,0 @@ -#include -#include "config.h" - -#if USE_AVRO - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include - -namespace DB -{ -namespace Setting -{ -extern const SettingsBool allow_data_lake_dynamic_schema; -} - -namespace ErrorCodes -{ - extern const int FILE_DOESNT_EXIST; - extern const int ILLEGAL_COLUMN; - extern const int BAD_ARGUMENTS; - extern const int UNSUPPORTED_METHOD; - extern const int LOGICAL_ERROR; -} - -Int32 parseTableSchema( - const Poco::JSON::Object::Ptr & metadata_object, IcebergSchemaProcessor & schema_processor, LoggerPtr metadata_logger); - -IcebergMetadata::IcebergMetadata( - ObjectStoragePtr object_storage_, - ConfigurationObserverPtr configuration_, - const DB::ContextPtr & context_, - Int32 metadata_version_, - Int32 format_version_, - String manifest_list_file_, - const Poco::JSON::Object::Ptr & object) - : WithContext(context_) - , object_storage(std::move(object_storage_)) - , configuration(std::move(configuration_)) - , metadata_version(metadata_version_) - , format_version(format_version_) - , manifest_list_file(std::move(manifest_list_file_)) - , schema_processor(IcebergSchemaProcessor()) - , log(getLogger("IcebergMetadata")) -{ - auto schema_id = parseTableSchema(object, schema_processor, log); - schema = *(schema_processor.getClickhouseTableSchemaById(schema_id)); - current_schema_id = schema_id; -} - -namespace -{ - -enum class ManifestEntryStatus : uint8_t -{ - EXISTING = 0, - ADDED = 1, - DELETED = 2, -}; - 
-enum class DataFileContent : uint8_t -{ - DATA = 0, - POSITION_DELETES = 1, - EQUALITY_DELETES = 2, -}; - -std::pair parseDecimal(const String & type_name) -{ - ReadBufferFromString buf(std::string_view(type_name.begin() + 8, type_name.end() - 1)); - size_t precision; - size_t scale; - readIntText(precision, buf); - skipWhitespaceIfAny(buf); - assertChar(',', buf); - skipWhitespaceIfAny(buf); - tryReadIntText(scale, buf); - return {precision, scale}; -} - -bool operator==(const Poco::JSON::Object & first, const Poco::JSON::Object & second) -{ - std::stringstream first_string_stream; // STYLE_CHECK_ALLOW_STD_STRING_STREAM - std::stringstream second_string_stream; // STYLE_CHECK_ALLOW_STD_STRING_STREAM - first.stringify(first_string_stream); - if (!first_string_stream) - { - throw Exception(ErrorCodes::LOGICAL_ERROR, "JSON Parsing failed"); - } - second.stringify(second_string_stream); - if (!second_string_stream) - { - throw Exception(ErrorCodes::LOGICAL_ERROR, "JSON Parsing failed"); - } - return first_string_stream.str() == second_string_stream.str(); -} - -bool operator!=(const Poco::JSON::Object & first, const Poco::JSON::Object & second) -{ - return !(first == second); -} -} - - -DataTypePtr IcebergSchemaProcessor::getSimpleType(const String & type_name) -{ - if (type_name == "boolean") - return DataTypeFactory::instance().get("Bool"); - if (type_name == "int") - return std::make_shared(); - if (type_name == "long") - return std::make_shared(); - if (type_name == "float") - return std::make_shared(); - if (type_name == "double") - return std::make_shared(); - if (type_name == "date") - return std::make_shared(); - if (type_name == "time") - return std::make_shared(); - if (type_name == "timestamp") - return std::make_shared(6); - if (type_name == "timestamptz") - return std::make_shared(6, "UTC"); - if (type_name == "string" || type_name == "binary") - return std::make_shared(); - if (type_name == "uuid") - return std::make_shared(); - - if (type_name.starts_with("fixed[") && type_name.ends_with(']')) - { - ReadBufferFromString buf(std::string_view(type_name.begin() + 6, type_name.end() - 1)); - size_t n; - readIntText(n, buf); - return std::make_shared(n); - } - - if (type_name.starts_with("decimal(") && type_name.ends_with(')')) - { - ReadBufferFromString buf(std::string_view(type_name.begin() + 8, type_name.end() - 1)); - auto [precision, scale] = parseDecimal(type_name); - return createDecimal(precision, scale); - } - - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown Iceberg type: {}", type_name); -} - -DataTypePtr IcebergSchemaProcessor::getComplexTypeFromObject(const Poco::JSON::Object::Ptr & type) -{ - String type_name = type->getValue("type"); - if (type_name == "list") - { - bool element_required = type->getValue("element-required"); - auto element_type = getFieldType(type, "element", element_required); - return std::make_shared(element_type); - } - - if (type_name == "map") - { - auto key_type = getFieldType(type, "key", true); - auto value_required = type->getValue("value-required"); - auto value_type = getFieldType(type, "value", value_required); - return std::make_shared(key_type, value_type); - } - - if (type_name == "struct") - { - DataTypes element_types; - Names element_names; - auto fields = type->get("fields").extract(); - element_types.reserve(fields->size()); - element_names.reserve(fields->size()); - for (size_t i = 0; i != fields->size(); ++i) - { - auto field = fields->getObject(static_cast(i)); - element_names.push_back(field->getValue("name")); - auto 
required = field->getValue("required"); - element_types.push_back(getFieldType(field, "type", required)); - } - - return std::make_shared(element_types, element_names); - } - - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown Iceberg type: {}", type_name); -} - -DataTypePtr IcebergSchemaProcessor::getFieldType(const Poco::JSON::Object::Ptr & field, const String & type_key, bool required) -{ - if (field->isObject(type_key)) - return getComplexTypeFromObject(field->getObject(type_key)); - - auto type = field->get(type_key); - if (type.isString()) - { - const String & type_name = type.extract(); - auto data_type = getSimpleType(type_name); - return required ? data_type : makeNullable(data_type); - } - - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unexpected 'type' field: {}", type.toString()); - -} - - -/** -* Iceberg allows only three types of primitive type conversion: -* int -> long -* float -> double -* decimal(P, S) -> decimal(P', S) where P' > P -* This function checks if `old_type` and `new_type` satisfy to one of these conditions. -**/ -bool IcebergSchemaProcessor::allowPrimitiveTypeConversion(const String & old_type, const String & new_type) -{ - bool allowed_type_conversion = (old_type == new_type); - allowed_type_conversion |= (old_type == "int") && (new_type == "long"); - allowed_type_conversion |= (old_type == "float") && (new_type == "double"); - if (old_type.starts_with("decimal(") && old_type.ends_with(')') && new_type.starts_with("decimal(") && new_type.ends_with(")")) - { - auto [old_precision, old_scale] = parseDecimal(old_type); - auto [new_precision, new_scale] = parseDecimal(new_type); - allowed_type_conversion |= (old_precision <= new_precision) && (old_scale == new_scale); - } - return allowed_type_conversion; -} -std::pair parseTableSchemaV2Method(const Poco::JSON::Object::Ptr & metadata_object) -{ - Poco::JSON::Object::Ptr schema; - if (!metadata_object->has("current-schema-id")) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot parse Iceberg table schema: 'current-schema-id' field is missing in metadata"); - auto current_schema_id = metadata_object->getValue("current-schema-id"); - if (!metadata_object->has("schemas")) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot parse Iceberg table schema: 'schemas' field is missing in metadata"); - auto schemas = metadata_object->get("schemas").extract(); - if (schemas->size() == 0) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot parse Iceberg table schema: schemas field is empty"); - for (uint32_t i = 0; i != schemas->size(); ++i) - { - auto current_schema = schemas->getObject(i); - if (!current_schema->has("schema-id")) - { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot parse Iceberg table schema: 'schema-id' field is missing in schema"); - } - if (current_schema->getValue("schema-id") == current_schema_id) - { - schema = current_schema; - break; - } - } - - if (!schema) - throw Exception(ErrorCodes::BAD_ARGUMENTS, R"(There is no schema with "schema-id" that matches "current-schema-id" in metadata)"); - if (schema->getValue("schema-id") != current_schema_id) - throw Exception(ErrorCodes::BAD_ARGUMENTS, R"(Field "schema-id" of the schema doesn't match "current-schema-id" in metadata)"); - return {schema, current_schema_id}; -} - -std::pair parseTableSchemaV1Method(const Poco::JSON::Object::Ptr & metadata_object) -{ - if (!metadata_object->has("schema")) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot parse Iceberg table schema: 'schema' field is missing in metadata"); - Poco::JSON::Object::Ptr schema 
= metadata_object->getObject("schema"); - if (!metadata_object->has("schema")) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot parse Iceberg table schema: 'schema-id' field is missing in schema"); - auto current_schema_id = schema->getValue("schema-id"); - return {schema, current_schema_id}; -} - -Int32 parseTableSchema( - const Poco::JSON::Object::Ptr & metadata_object, IcebergSchemaProcessor & schema_processor, LoggerPtr metadata_logger) -{ - Int32 format_version = metadata_object->getValue("format-version"); - if (format_version == 2) - { - auto [schema, current_schema_id] = parseTableSchemaV2Method(metadata_object); - schema_processor.addIcebergTableSchema(schema); - return current_schema_id; - } - else - { - try - { - auto [schema, current_schema_id] = parseTableSchemaV1Method(metadata_object); - schema_processor.addIcebergTableSchema(schema); - return current_schema_id; - } - catch (const Exception & first_error) - { - if (first_error.code() != ErrorCodes::BAD_ARGUMENTS) - throw; - try - { - auto [schema, current_schema_id] = parseTableSchemaV2Method(metadata_object); - schema_processor.addIcebergTableSchema(schema); - LOG_WARNING( - metadata_logger, - "Iceberg table schema was parsed using v2 specification, but it was impossible to parse it using v1 " - "specification. Be " - "aware that you Iceberg writing engine violates Iceberg specification. Error during parsing {}", - first_error.displayText()); - return current_schema_id; - } - catch (const Exception & second_error) - { - if (first_error.code() != ErrorCodes::BAD_ARGUMENTS) - throw; - throw Exception( - ErrorCodes::BAD_ARGUMENTS, - "Cannot parse Iceberg table schema both with v1 and v2 methods. Old method error: {}. New method error: {}", - first_error.displayText(), - second_error.displayText()); - } - } - } -} - - -// Ids are passed only for error logging purposes -std::shared_ptr IcebergSchemaProcessor::getSchemaTransformationDag( - const Poco::JSON::Object::Ptr & old_schema, const Poco::JSON::Object::Ptr & new_schema, Int32 old_id, Int32 new_id) -{ - std::unordered_map> old_schema_entries; - auto old_schema_fields = old_schema->get("fields").extract(); - std::shared_ptr dag = std::make_shared(); - auto & outputs = dag->getOutputs(); - for (size_t i = 0; i != old_schema_fields->size(); ++i) - { - auto field = old_schema_fields->getObject(static_cast(i)); - size_t id = field->getValue("id"); - auto name = field->getValue("name"); - bool required = field->getValue("required"); - old_schema_entries[id] = {field, &dag->addInput(name, getFieldType(field, "type", required))}; - } - auto new_schema_fields = new_schema->get("fields").extract(); - for (size_t i = 0; i != new_schema_fields->size(); ++i) - { - auto field = new_schema_fields->getObject(static_cast(i)); - size_t id = field->getValue("id"); - auto name = field->getValue("name"); - bool required = field->getValue("required"); - auto type = getFieldType(field, "type", required); - auto old_node_it = old_schema_entries.find(id); - if (old_node_it != old_schema_entries.end()) - { - auto [old_json, old_node] = old_node_it->second; - if (field->isObject("type")) - { - if (*old_json != *field) - { - throw Exception( - ErrorCodes::UNSUPPORTED_METHOD, - "Schema evolution is not supported for complex types yet, field id is {}, old schema id is {}, new schema id " - "is {}", - id, - old_id, - new_id); - } - else - { - outputs.push_back(old_node); - } - } - else - { - if (old_json->isObject("type")) - { - throw Exception( - ErrorCodes::LOGICAL_ERROR, - "Can't cast primitive 
type to the complex type, field id is {}, old schema id is {}, new schema id is {}", - id, - old_id, - new_id); - } - String old_type = old_json->getValue("type"); - String new_type = field->getValue("type"); - - const ActionsDAG::Node * node = old_node; - if (old_type == new_type) - { - if (old_json->getValue("name") != name) - { - node = &dag->addAlias(*old_node, name); - } - } - else if (allowPrimitiveTypeConversion(old_type, new_type)) - { - node = &dag->addCast(*old_node, getFieldType(field, "type", required), name); - } - outputs.push_back(node); - } - } - else - { - if (field->isObject("type")) - { - throw Exception( - ErrorCodes::UNSUPPORTED_METHOD, - "Adding a default column with id {} and complex type is not supported yet. Old schema id is {}, new schema id is " - "{}", - id, - old_id, - new_id); - } - if (!type->isNullable()) - { - throw Exception( - ErrorCodes::LOGICAL_ERROR, - "Cannot add a column with id {} with required values to the table during schema evolution. This is forbidden by " - "Iceberg format specification. Old schema id is {}, new " - "schema id is {}", - id, - old_id, - new_id); - } - ColumnPtr default_type_column = type->createColumnConstWithDefaultValue(0); - const auto & constant = dag->addColumn({default_type_column, type, name}); - outputs.push_back(&constant); - } - } - return dag; -} - -std::shared_ptr IcebergSchemaProcessor::getSchemaTransformationDagByIds(Int32 old_id, Int32 new_id) -{ - if (old_id == new_id) - { - return nullptr; - } - std::lock_guard lock(mutex); - auto required_transform_dag_it = transform_dags_by_ids.find({old_id, new_id}); - if (required_transform_dag_it != transform_dags_by_ids.end()) - { - return required_transform_dag_it->second; - } - - auto old_schema_it = iceberg_table_schemas_by_ids.find(old_id); - if (old_schema_it == iceberg_table_schemas_by_ids.end()) - { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Schema with schema-id {} is unknown", old_id); - } - auto new_schema_it = iceberg_table_schemas_by_ids.find(new_id); - if (new_schema_it == iceberg_table_schemas_by_ids.end()) - { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Schema with schema-id {} is unknown", new_id); - } - return transform_dags_by_ids[{old_id, new_id}] - = getSchemaTransformationDag(old_schema_it->second, new_schema_it->second, old_id, new_id); -} - -void IcebergSchemaProcessor::addIcebergTableSchema(Poco::JSON::Object::Ptr schema_ptr) -{ - Int32 schema_id = schema_ptr->getValue("schema-id"); - if (iceberg_table_schemas_by_ids.contains(schema_id)) - { - chassert(clickhouse_table_schemas_by_ids.contains(schema_id)); - chassert(*iceberg_table_schemas_by_ids.at(schema_id) == *schema_ptr); - } - else - { - iceberg_table_schemas_by_ids[schema_id] = schema_ptr; - auto fields = schema_ptr->get("fields").extract(); - auto clickhouse_schema = std::make_shared(); - for (size_t i = 0; i != fields->size(); ++i) - { - auto field = fields->getObject(static_cast(i)); - auto name = field->getValue("name"); - bool required = field->getValue("required"); - clickhouse_schema->push_back(NameAndTypePair{name, getFieldType(field, "type", required)}); - } - clickhouse_table_schemas_by_ids[schema_id] = clickhouse_schema; - } -} - -std::shared_ptr IcebergSchemaProcessor::getClickhouseTableSchemaById(Int32 id) -{ - auto it = clickhouse_table_schemas_by_ids.find(id); - if (it == clickhouse_table_schemas_by_ids.end()) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Schema with id {} is unknown", id); - return it->second; -} - -MutableColumns parseAvro(avro::DataFileReaderBase & 
file_reader, const Block & header, const FormatSettings & settings) -{ - auto deserializer = std::make_unique(header, file_reader.dataSchema(), true, true, settings); - MutableColumns columns = header.cloneEmptyColumns(); - - file_reader.init(); - RowReadExtension ext; - while (file_reader.hasMore()) - { - file_reader.decr(); - deserializer->deserializeRow(columns, file_reader.decoder(), ext); - } - return columns; -} - -/** - * Each version of table metadata is stored in a `metadata` directory and - * has one of 2 formats: - * 1) v.metadata.json, where V - metadata version. - * 2) -.metadata.json, where V - metadata version - */ -std::pair -getMetadataFileAndVersion(const ObjectStoragePtr & object_storage, const StorageObjectStorage::Configuration & configuration) -{ - const auto metadata_files = listFiles(*object_storage, configuration, "metadata", ".metadata.json"); - if (metadata_files.empty()) - { - throw Exception( - ErrorCodes::FILE_DOESNT_EXIST, - "The metadata file for Iceberg table with path {} doesn't exist", - configuration.getPath()); - } - - std::vector> metadata_files_with_versions; - metadata_files_with_versions.reserve(metadata_files.size()); - for (const auto & path : metadata_files) - { - String file_name(path.begin() + path.find_last_of('/') + 1, path.end()); - String version_str; - /// v.metadata.json - if (file_name.starts_with('v')) - version_str = String(file_name.begin() + 1, file_name.begin() + file_name.find_first_of('.')); - /// -.metadata.json - else - version_str = String(file_name.begin(), file_name.begin() + file_name.find_first_of('-')); - - if (!std::all_of(version_str.begin(), version_str.end(), isdigit)) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Bad metadata file name: {}. Expected vN.metadata.json where N is a number", file_name); - metadata_files_with_versions.emplace_back(std::stoi(version_str), path); - } - - /// Get the latest version of metadata file: v.metadata.json - return *std::max_element(metadata_files_with_versions.begin(), metadata_files_with_versions.end()); -} - - -DataLakeMetadataPtr IcebergMetadata::create( - const ObjectStoragePtr & object_storage, const ConfigurationObserverPtr & configuration, const ContextPtr & local_context) -{ - auto configuration_ptr = configuration.lock(); - - const auto [metadata_version, metadata_file_path] = getMetadataFileAndVersion(object_storage, *configuration_ptr); - - auto log = getLogger("IcebergMetadata"); - LOG_DEBUG(log, "Parse metadata {}", metadata_file_path); - - StorageObjectStorageSource::ObjectInfo object_info(metadata_file_path); - auto buf = StorageObjectStorageSource::createReadBuffer(object_info, object_storage, local_context, log); - - String json_str; - readJSONObjectPossiblyInvalid(json_str, *buf); - - Poco::JSON::Parser parser; /// For some reason base/base/JSON.h can not parse this json file - Poco::Dynamic::Var json = parser.parse(json_str); - const Poco::JSON::Object::Ptr & object = json.extract(); - - IcebergSchemaProcessor schema_processor; - - auto format_version = object->getValue("format-version"); - - auto snapshots = object->get("snapshots").extract(); - - String manifest_list_file; - auto current_snapshot_id = object->getValue("current-snapshot-id"); - - for (size_t i = 0; i < snapshots->size(); ++i) - { - const auto snapshot = snapshots->getObject(static_cast(i)); - if (snapshot->getValue("snapshot-id") == current_snapshot_id) - { - const auto path = snapshot->getValue("manifest-list"); - manifest_list_file = std::filesystem::path(configuration_ptr->getPath()) / 
"metadata" / std::filesystem::path(path).filename(); - break; - } - } - - auto ptr = std::make_unique( - object_storage, configuration_ptr, local_context, metadata_version, format_version, manifest_list_file, object); - - - return ptr; -} - -/** - * Manifest file has the following format: '/iceberg_data/db/table_name/metadata/c87bfec7-d36c-4075-ad04-600b6b0f2020-m0.avro' - * - * `manifest file` is different in format version V1 and V2 and has the following contents: - * v1 v2 - * status req req - * snapshot_id req opt - * sequence_number opt - * file_sequence_number opt - * data_file req req - * Example format version V1: - * ┌─status─┬─────────snapshot_id─┬─data_file───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ - * │ 1 │ 2819310504515118887 │ ('/iceberg_data/db/table_name/data/00000-1-3edca534-15a0-4f74-8a28-4733e0bf1270-00001.parquet','PARQUET',(),100,1070,67108864,[(1,233),(2,210)],[(1,100),(2,100)],[(1,0),(2,0)],[],[(1,'\0'),(2,'0')],[(1,'c'),(2,'99')],NULL,[4],0) │ - * └────────┴─────────────────────┴─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ - * Example format version V2: - * ┌─status─┬─────────snapshot_id─┬─sequence_number─┬─file_sequence_number─┬─data_file───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ - * │ 1 │ 5887006101709926452 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ (0,'/iceberg_data/db/table_name/data/00000-1-c8045c90-8799-4eac-b957-79a0484e223c-00001.parquet','PARQUET',(),100,1070,[(1,233),(2,210)],[(1,100),(2,100)],[(1,0),(2,0)],[],[(1,'\0'),(2,'0')],[(1,'c'),(2,'99')],NULL,[4],[],0) │ - * └────────┴─────────────────────┴─────────────────┴──────────────────────┴─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ - * In case of partitioned data we'll have extra directory partition=value: - * ─status─┬─────────snapshot_id─┬─data_file──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ - * │ 1 │ 2252246380142525104 │ ('/iceberg_data/db/table_name/data/a=0/00000-1-c9535a00-2f4f-405c-bcfa-6d4f9f477235-00001.parquet','PARQUET',(0),1,631,67108864,[(1,46),(2,48)],[(1,1),(2,1)],[(1,0),(2,0)],[],[(1,'\0\0\0\0\0\0\0\0'),(2,'1')],[(1,'\0\0\0\0\0\0\0\0'),(2,'1')],NULL,[4],0) │ - * │ 1 │ 2252246380142525104 │ ('/iceberg_data/db/table_name/data/a=1/00000-1-c9535a00-2f4f-405c-bcfa-6d4f9f477235-00002.parquet','PARQUET',(1),1,631,67108864,[(1,46),(2,48)],[(1,1),(2,1)],[(1,0),(2,0)],[],[(1,'\0\0\0\0\0\0\0'),(2,'2')],[(1,'\0\0\0\0\0\0\0'),(2,'2')],NULL,[4],0) │ - * │ 1 │ 2252246380142525104 │ 
('/iceberg_data/db/table_name/data/a=2/00000-1-c9535a00-2f4f-405c-bcfa-6d4f9f477235-00003.parquet','PARQUET',(2),1,631,67108864,[(1,46),(2,48)],[(1,1),(2,1)],[(1,0),(2,0)],[],[(1,'\0\0\0\0\0\0\0'),(2,'3')],[(1,'\0\0\0\0\0\0\0'),(2,'3')],NULL,[4],0) │ - * └────────┴─────────────────────┴────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ - */ - - -Strings IcebergMetadata::getDataFiles() const -{ - std::lock_guard lock(get_data_files_mutex); - if (!data_files.empty()) - return data_files; - - auto configuration_ptr = configuration.lock(); - Strings manifest_files; - if (manifest_list_file.empty()) - return data_files; - - LOG_TEST(log, "Collect manifest files from manifest list {}", manifest_list_file); - - auto context = getContext(); - StorageObjectStorageSource::ObjectInfo object_info(manifest_list_file); - auto manifest_list_buf = StorageObjectStorageSource::createReadBuffer(object_info, object_storage, context, log); - auto manifest_list_file_reader = std::make_unique(std::make_unique(*manifest_list_buf)); - - auto data_type = AvroSchemaReader::avroNodeToDataType(manifest_list_file_reader->dataSchema().root()->leafAt(0)); - Block header{{data_type->createColumn(), data_type, "manifest_path"}}; - auto columns = parseAvro(*manifest_list_file_reader, header, getFormatSettings(context)); - auto & col = columns.at(0); - - if (col->getDataType() != TypeIndex::String) - { - throw Exception( - ErrorCodes::ILLEGAL_COLUMN, - "The parsed column from Avro file of `manifest_path` field should be String type, got {}", - col->getFamilyName()); - } - - const auto * col_str = typeid_cast(col.get()); - for (size_t i = 0; i < col_str->size(); ++i) - { - const auto file_path = col_str->getDataAt(i).toView(); - const auto filename = std::filesystem::path(file_path).filename(); - manifest_files.emplace_back(std::filesystem::path(configuration_ptr->getPath()) / "metadata" / filename); - } - - LOG_TEST(log, "Collect data files"); - for (const auto & manifest_file : manifest_files) - { - LOG_TEST(log, "Process manifest file {}", manifest_file); - - StorageObjectStorageSource::ObjectInfo manifest_object_info(manifest_file); - auto buffer = StorageObjectStorageSource::createReadBuffer(manifest_object_info, object_storage, context, log); - auto manifest_file_reader = std::make_unique(std::make_unique(*buffer)); - - /// Manifest file should always have table schema in avro file metadata. By now we don't support tables with evolved schema, - /// so we should check if all manifest files have the same schema as in table metadata. 
- auto avro_metadata = manifest_file_reader->metadata(); - auto avro_schema_it = avro_metadata.find("schema"); - if (avro_schema_it == avro_metadata.end()) - throw Exception( - ErrorCodes::BAD_ARGUMENTS, - "Cannot read Iceberg table: manifest file {} doesn't have table schema in its metadata", - manifest_file); - std::vector schema_json = avro_schema_it->second; - String schema_json_string = String(reinterpret_cast(schema_json.data()), schema_json.size()); - Poco::JSON::Parser parser; - Poco::Dynamic::Var json = parser.parse(schema_json_string); - const Poco::JSON::Object::Ptr & schema_object = json.extract(); - Int32 schema_object_id = schema_object->getValue("schema-id"); - avro::NodePtr root_node = manifest_file_reader->dataSchema().root(); - size_t leaves_num = root_node->leaves(); - size_t expected_min_num = format_version == 1 ? 3 : 2; - if (leaves_num < expected_min_num) - { - throw Exception( - ErrorCodes::BAD_ARGUMENTS, - "Unexpected number of columns {}. Expected at least {}", - root_node->leaves(), expected_min_num); - } - - avro::NodePtr status_node = root_node->leafAt(0); - if (status_node->type() != avro::Type::AVRO_INT) - { - throw Exception( - ErrorCodes::ILLEGAL_COLUMN, - "The parsed column from Avro file of `status` field should be Int type, got {}", - magic_enum::enum_name(status_node->type())); - } - - avro::NodePtr data_file_node = root_node->leafAt(static_cast(leaves_num) - 1); - if (data_file_node->type() != avro::Type::AVRO_RECORD) - { - throw Exception( - ErrorCodes::ILLEGAL_COLUMN, - "The parsed column from Avro file of `data_file` field should be Tuple type, got {}", - magic_enum::enum_name(data_file_node->type())); - } - - auto status_col_data_type = AvroSchemaReader::avroNodeToDataType(status_node); - auto data_col_data_type = AvroSchemaReader::avroNodeToDataType(data_file_node); - Block manifest_file_header - = {{status_col_data_type->createColumn(), status_col_data_type, "status"}, - {data_col_data_type->createColumn(), data_col_data_type, "data_file"}}; - - columns = parseAvro(*manifest_file_reader, manifest_file_header, getFormatSettings(getContext())); - if (columns.size() != 2) - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Unexpected number of columns. 
Expected 2, got {}", columns.size()); - - if (columns.at(0)->getDataType() != TypeIndex::Int32) - { - throw Exception( - ErrorCodes::ILLEGAL_COLUMN, - "The parsed column from Avro file of `status` field should be Int32 type, got {}", - columns.at(0)->getFamilyName()); - } - if (columns.at(1)->getDataType() != TypeIndex::Tuple) - { - throw Exception( - ErrorCodes::ILLEGAL_COLUMN, - "The parsed column from Avro file of `file_path` field should be Tuple type, got {}", - columns.at(1)->getFamilyName()); - } - - const auto * status_int_column = assert_cast(columns.at(0).get()); - const auto & data_file_tuple_type = assert_cast(*data_col_data_type.get()); - const auto * data_file_tuple_column = assert_cast(columns.at(1).get()); - - if (status_int_column->size() != data_file_tuple_column->size()) - { - throw Exception( - ErrorCodes::ILLEGAL_COLUMN, - "The parsed column from Avro file of `file_path` and `status` have different rows number: {} and {}", - status_int_column->size(), - data_file_tuple_column->size()); - } - - ColumnPtr file_path_column = data_file_tuple_column->getColumnPtr(data_file_tuple_type.getPositionByName("file_path")); - - if (file_path_column->getDataType() != TypeIndex::String) - { - throw Exception( - ErrorCodes::ILLEGAL_COLUMN, - "The parsed column from Avro file of `file_path` field should be String type, got {}", - file_path_column->getFamilyName()); - } - - const auto * file_path_string_column = assert_cast(file_path_column.get()); - - ColumnPtr content_column; - const ColumnInt32 * content_int_column = nullptr; - if (format_version == 2) - { - content_column = data_file_tuple_column->getColumnPtr(data_file_tuple_type.getPositionByName("content")); - if (content_column->getDataType() != TypeIndex::Int32) - { - throw Exception( - ErrorCodes::ILLEGAL_COLUMN, - "The parsed column from Avro file of `content` field should be Int type, got {}", - content_column->getFamilyName()); - } - - content_int_column = assert_cast(content_column.get()); - } - - for (size_t i = 0; i < data_file_tuple_column->size(); ++i) - { - if (format_version == 2) - { - Int32 content_type = content_int_column->getElement(i); - if (DataFileContent(content_type) != DataFileContent::DATA) - throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Cannot read Iceberg table: positional and equality deletes are not supported"); - } - - const auto status = status_int_column->getInt(i); - const auto data_path = std::string(file_path_string_column->getDataAt(i).toView()); - const auto pos = data_path.find(configuration_ptr->getPath()); - if (pos == std::string::npos) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Expected to find {} in data path: {}", configuration_ptr->getPath(), data_path); - - const auto file_path = data_path.substr(pos); - - if (ManifestEntryStatus(status) == ManifestEntryStatus::DELETED) - { - LOG_TEST(log, "Processing delete file for path: {}", file_path); - chassert(schema_id_by_data_file.contains(file_path) == 0); - } - else - { - LOG_TEST(log, "Processing data file for path: {}", file_path); - schema_id_by_data_file[file_path] = schema_object_id; - } - } - - schema_processor.addIcebergTableSchema(schema_object); - } - - for (const auto & [file_path, schema_object_id] : schema_id_by_data_file) - { - data_files.emplace_back(file_path); - } - return data_files; -} - -} - -#endif diff --git a/src/Storages/ObjectStorage/DataLakes/IcebergMetadata.h b/src/Storages/ObjectStorage/DataLakes/IcebergMetadata.h deleted file mode 100644 index fb5a7800228..00000000000 --- 
a/src/Storages/ObjectStorage/DataLakes/IcebergMetadata.h +++ /dev/null @@ -1,243 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include "config.h" - -#if USE_AVRO /// StorageIceberg depending on Avro to parse metadata with Avro format. - -#include -#include -#include -#include -#include - -#include -#include -#include - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int BAD_ARGUMENTS; -} - -/** - * Iceberg supports the next data types (see https://iceberg.apache.org/spec/#schemas-and-data-types): - * - Primitive types: - * - boolean - * - int - * - long - * - float - * - double - * - decimal(P, S) - * - date - * - time (time of day in microseconds since midnight) - * - timestamp (in microseconds since 1970-01-01) - * - timestamptz (timestamp with timezone, stores values in UTC timezone) - * - string - * - uuid - * - fixed(L) (fixed-length byte array of length L) - * - binary - * - Complex types: - * - struct(field1: Type1, field2: Type2, ...) (tuple of typed values) - * - list(nested_type) - * - map(Key, Value) - * - * Example of table schema in metadata: - * { - * "type" : "struct", - * "schema-id" : 0, - * "fields" : [ - * { - * "id" : 1, - * "name" : "id", - * "required" : false, - * "type" : "long" - * }, - * { - * "id" : 2, - * "name" : "array", - * "required" : false, - * "type" : { - * "type" : "list", - * "element-id" : 5, - * "element" : "int", - * "element-required" : false - * }, - * { - * "id" : 3, - * "name" : "data", - * "required" : false, - * "type" : "binary" - * } - * } - */ -class IcebergSchemaProcessor -{ - using Node = ActionsDAG::Node; - -public: - void addIcebergTableSchema(Poco::JSON::Object::Ptr schema_ptr); - std::shared_ptr getClickhouseTableSchemaById(Int32 id); - std::shared_ptr getSchemaTransformationDagByIds(Int32 old_id, Int32 new_id); - -private: - std::unordered_map iceberg_table_schemas_by_ids; - std::unordered_map> clickhouse_table_schemas_by_ids; - std::map, std::shared_ptr> transform_dags_by_ids; - - NamesAndTypesList getSchemaType(const Poco::JSON::Object::Ptr & schema); - DataTypePtr getComplexTypeFromObject(const Poco::JSON::Object::Ptr & type); - DataTypePtr getFieldType(const Poco::JSON::Object::Ptr & field, const String & type_key, bool required); - DataTypePtr getSimpleType(const String & type_name); - - bool allowPrimitiveTypeConversion(const String & old_type, const String & new_type); - const Node * getDefaultNodeForField(const Poco::JSON::Object::Ptr & field); - - std::shared_ptr getSchemaTransformationDag( - const Poco::JSON::Object::Ptr & old_schema, const Poco::JSON::Object::Ptr & new_schema, Int32 old_id, Int32 new_id); - - std::mutex mutex; -}; - - -/** - * Useful links: - * - https://iceberg.apache.org/spec/ - * - * Iceberg has two format versions, v1 and v2. The content of metadata files depends on the version. - * - * Unlike DeltaLake, Iceberg has several metadata layers: `table metadata`, `manifest list` and `manifest_files`. - * Metadata file - json file. - * Manifest list – an Avro file that lists manifest files; one per snapshot. - * Manifest file – an Avro file that lists data or delete files; a subset of a snapshot. - * All changes to table state create a new metadata file and replace the old metadata with an atomic swap. - * - * In order to find out which data files to read, we need to find the `manifest list` - * which corresponds to the latest snapshot. We find it by checking a list of snapshots - * in metadata's "snapshots" section. - * - * Example of metadata.json file. 
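A rough sketch of that snapshot lookup over an already parsed metadata.json, assuming Poco JSON (the free function is illustrative and not part of this header); the example file such a helper would consume is shown right below.

#include <Poco/JSON/Array.h>
#include <Poco/JSON/Object.h>
#include <cstdint>
#include <optional>
#include <string>

/// Return the "manifest-list" path of the snapshot referenced by
/// "current-snapshot-id", or nothing if no snapshot matches.
std::optional<std::string> findCurrentManifestList(const Poco::JSON::Object::Ptr & metadata)
{
    const auto current_snapshot_id = metadata->getValue<int64_t>("current-snapshot-id");
    auto snapshots = metadata->get("snapshots").extract<Poco::JSON::Array::Ptr>();
    for (unsigned int i = 0; i < snapshots->size(); ++i)
    {
        auto snapshot = snapshots->getObject(i);
        if (snapshot->getValue<int64_t>("snapshot-id") == current_snapshot_id)
            return snapshot->getValue<std::string>("manifest-list");
    }
    return std::nullopt;
}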
- * { - * "format-version" : 1, - * "table-uuid" : "ca2965ad-aae2-4813-8cf7-2c394e0c10f5", - * "location" : "/iceberg_data/db/table_name", - * "last-updated-ms" : 1680206743150, - * "last-column-id" : 2, - * "schema" : { "type" : "struct", "schema-id" : 0, "fields" : [ {}, {}, ... ] }, - * "current-schema-id" : 0, - * "schemas" : [ ], - * ... - * "current-snapshot-id" : 2819310504515118887, - * "refs" : { "main" : { "snapshot-id" : 2819310504515118887, "type" : "branch" } }, - * "snapshots" : [ { - * "snapshot-id" : 2819310504515118887, - * "timestamp-ms" : 1680206743150, - * "summary" : { - * "operation" : "append", "spark.app.id" : "local-1680206733239", - * "added-data-files" : "1", "added-records" : "100", - * "added-files-size" : "1070", "changed-partition-count" : "1", - * "total-records" : "100", "total-files-size" : "1070", "total-data-files" : "1", "total-delete-files" : "0", - * "total-position-deletes" : "0", "total-equality-deletes" : "0" - * }, - * "manifest-list" : "/iceberg_data/db/table_name/metadata/snap-2819310504515118887-1-c87bfec7-d36c-4075-ad04-600b6b0f2020.avro", - * "schema-id" : 0 - * } ], - * "statistics" : [ ], - * "snapshot-log" : [ ... ], - * "metadata-log" : [ ] - * } - */ -class IcebergMetadata : public IDataLakeMetadata, private WithContext -{ -public: - using ConfigurationObserverPtr = StorageObjectStorage::ConfigurationObserverPtr; - - static constexpr auto name = "Iceberg"; - - IcebergMetadata( - ObjectStoragePtr object_storage_, - ConfigurationObserverPtr configuration_, - const DB::ContextPtr & context_, - Int32 metadata_version_, - Int32 format_version_, - String manifest_list_file_, - const Poco::JSON::Object::Ptr& object); - - /// Get data files. On first request it reads manifest_list file and iterates through manifest files to find all data files. - /// All subsequent calls will return saved list of files (because it cannot be changed without changing metadata file) - Strings getDataFiles() const override; - - /// Get table schema parsed from metadata. - NamesAndTypesList getTableSchema() const override { return schema; } - - const std::unordered_map & getColumnNameToPhysicalNameMapping() const override { return column_name_to_physical_name; } - - const DataLakePartitionColumns & getPartitionColumns() const override { return partition_columns; } - - bool operator ==(const IDataLakeMetadata & other) const override - { - const auto * iceberg_metadata = dynamic_cast(&other); - return iceberg_metadata && getVersion() == iceberg_metadata->getVersion(); - } - - static DataLakeMetadataPtr - create(const ObjectStoragePtr & object_storage, const ConfigurationObserverPtr & configuration, const ContextPtr & local_context); - - size_t getVersion() const { return metadata_version; } - - std::shared_ptr getInitialSchemaByPath(const String & data_path) const override - { - auto version_if_outdated = getSchemaVersionByFileIfOutdated(data_path); - return version_if_outdated.has_value() ? schema_processor.getClickhouseTableSchemaById(version_if_outdated.value()) : nullptr; - } - - std::shared_ptr getSchemaTransformer(const String & data_path) const override - { - auto version_if_outdated = getSchemaVersionByFileIfOutdated(data_path); - return version_if_outdated.has_value() - ? 
schema_processor.getSchemaTransformationDagByIds(version_if_outdated.value(), current_schema_id) - : nullptr; - } - - bool supportsExternalMetadataChange() const override { return true; } - -private: - mutable std::unordered_map schema_id_by_data_file; - - const ObjectStoragePtr object_storage; - const ConfigurationObserverPtr configuration; - Int32 metadata_version; - Int32 format_version; - String manifest_list_file; - Int32 current_schema_id; - mutable Strings data_files; - std::unordered_map column_name_to_physical_name; - DataLakePartitionColumns partition_columns; - NamesAndTypesList schema; - mutable IcebergSchemaProcessor schema_processor; - LoggerPtr log; - - mutable std::mutex get_data_files_mutex; - - std::optional getSchemaVersionByFileIfOutdated(String data_path) const - { - auto schema_id = schema_id_by_data_file.find(data_path); - if (schema_id == schema_id_by_data_file.end()) - { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot find schema version for data file: {}", data_path); - } - if (schema_id->second == current_schema_id) - return std::nullopt; - return std::optional{schema_id->second}; - } -}; - -} - -#endif diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index 6b6920d5547..b3295f4177a 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -1,4 +1,4 @@ -#include + #include #include #include diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index af8dfb0b6de..1ebda25ceb1 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -506,6 +506,7 @@ std::unique_ptr StorageObjectStorageSource::createReadBu std::unique_ptr impl; if (use_cache) { + chassert(object_info.metadata.has_value()); if (object_info.metadata->etag.empty()) { LOG_WARNING(log, "Cannot use filesystem cache, no etag specified"); @@ -540,9 +541,13 @@ std::unique_ptr StorageObjectStorageSource::createReadBu /* read_until_position */std::nullopt, context_->getFilesystemCacheLog()); - LOG_TEST(log, "Using filesystem cache `{}` (path: {}, etag: {}, hash: {})", - filesystem_cache_name, object_info.getPath(), - object_info.metadata->etag, toString(hash.get128())); + LOG_TEST( + log, + "Using filesystem cache `{}` (path: {}, etag: {}, hash: {})", + filesystem_cache_name, + object_info.getPath(), + object_info.metadata->etag, + toString(hash.get128())); } } diff --git a/tests/integration/test_storage_iceberg/test.py b/tests/integration/test_storage_iceberg/test.py index cd79aacd534..29188be3e5b 100644 --- a/tests/integration/test_storage_iceberg/test.py +++ b/tests/integration/test_storage_iceberg/test.py @@ -720,7 +720,6 @@ def test_delete_files(started_cluster, format_version, storage_type): ) assert int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) == 0 - assert instance.contains_in_log("Processing delete file for path") write_iceberg_from_df( spark, From 266e854560e2f6b03d59efc734e4e2ad95f30f98 Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Fri, 6 Dec 2024 16:23:30 +0100 Subject: [PATCH 25/41] add sendProgress between read blocks --- src/Server/TCPHandler.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Server/TCPHandler.cpp b/src/Server/TCPHandler.cpp index 5dd31b8af98..e2dc77c9ce1 100644 --- a/src/Server/TCPHandler.cpp +++ b/src/Server/TCPHandler.cpp @@ -37,7 +37,6 
+37,6 @@ #include #include #include -#include "Common/StackTrace.h" #include #include #include @@ -1010,6 +1009,7 @@ void TCPHandler::readData(QueryState & state) while (receivePacketsExpectData(state)) { + sendProgress(state); sendLogs(state); sendInsertProfileEvents(state); } @@ -1114,6 +1114,7 @@ void TCPHandler::processInsertQuery(QueryState & state) { executor.push(std::move(state.block_for_insert)); + sendProgress(state); sendLogs(state); sendInsertProfileEvents(state); } From ccde2a3cddd441f178d24b1b6475c1e07d692459 Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Fri, 6 Dec 2024 17:00:03 +0100 Subject: [PATCH 26/41] remove sendProgress call when reading data --- src/Server/TCPHandler.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/Server/TCPHandler.cpp b/src/Server/TCPHandler.cpp index e2dc77c9ce1..fff9045229c 100644 --- a/src/Server/TCPHandler.cpp +++ b/src/Server/TCPHandler.cpp @@ -1009,7 +1009,6 @@ void TCPHandler::readData(QueryState & state) while (receivePacketsExpectData(state)) { - sendProgress(state); sendLogs(state); sendInsertProfileEvents(state); } @@ -1114,7 +1113,6 @@ void TCPHandler::processInsertQuery(QueryState & state) { executor.push(std::move(state.block_for_insert)); - sendProgress(state); sendLogs(state); sendInsertProfileEvents(state); } From c28ae71b3fa067c310a32f8b04707dd50f1288e7 Mon Sep 17 00:00:00 2001 From: divanik Date: Mon, 9 Dec 2024 11:00:29 +0000 Subject: [PATCH 27/41] Add fast update --- .../DataLakes/DataLakeConfiguration.h | 35 ++++-- .../DataLakes/IDataLakeMetadata.h | 2 + .../DataLakes/Iceberg/IcebergMetadata.cpp | 111 ++++++++++++------ .../DataLakes/Iceberg/IcebergMetadata.h | 28 +++-- 4 files changed, 120 insertions(+), 56 deletions(-) diff --git a/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h b/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h index f32631c4438..52b5a553de1 100644 --- a/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h +++ b/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h @@ -11,6 +11,7 @@ #include #include #include +#include "Common/DateLUT.h" #include #include "Storages/ColumnsDescription.h" @@ -45,9 +46,8 @@ public: void update(ObjectStoragePtr object_storage, ContextPtr local_context) override { BaseStorageConfiguration::update(object_storage, local_context); - auto new_metadata = DataLakeMetadata::create(object_storage, weak_from_this(), local_context); - if (!current_metadata || (*current_metadata != *new_metadata)) + if (updateMetadataObjectIfNeeded(object_storage, local_context)) { if (hasExternalDynamicMetadata()) { @@ -57,7 +57,6 @@ public: } else { - current_metadata = std::move(new_metadata); BaseStorageConfiguration::setPaths(current_metadata->getDataFiles()); BaseStorageConfiguration::setPartitionColumns(current_metadata->getPartitionColumns()); } @@ -99,11 +98,8 @@ public: ColumnsDescription updateAndGetCurrentSchema(ObjectStoragePtr object_storage, ContextPtr context) override { BaseStorageConfiguration::update(object_storage, context); - auto new_metadata = DataLakeMetadata::create(object_storage, weak_from_this(), context); - - if (!current_metadata || (*current_metadata != *new_metadata)) + if (updateMetadataObjectIfNeeded(object_storage, context)) { - current_metadata = std::move(new_metadata); BaseStorageConfiguration::setPaths(current_metadata->getDataFiles()); BaseStorageConfiguration::setPartitionColumns(current_metadata->getPartitionColumns()); } @@ -137,6 +133,31 @@ private: } return info; } + + bool 
updateMetadataObjectIfNeeded(ObjectStoragePtr object_storage, ContextPtr context) + { + if (!current_metadata) + { + current_metadata = DataLakeMetadata::create(object_storage, weak_from_this(), context); + return true; + } + + if (current_metadata->supportsUpdate()) + { + return current_metadata->update(context); + } + + auto new_metadata = DataLakeMetadata::create(object_storage, weak_from_this(), context); + if (*current_metadata != *new_metadata) + { + current_metadata = std::move(new_metadata); + return true; + } + else + { + return false; + } + } }; using IcebergMetadata = Iceberg::IcebergMetadata; diff --git a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h index de6324369c3..980a0d5ae7d 100644 --- a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h +++ b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h @@ -19,6 +19,8 @@ public: virtual std::shared_ptr getInitialSchemaByPath(const String &) const { return {}; } virtual std::shared_ptr getSchemaTransformer(const String &) const { return {}; } virtual bool supportsExternalMetadataChange() const { return false; } + virtual bool supportsUpdate() const { return false; } + virtual bool update(const ContextPtr &) { return false; } }; using DataLakeMetadataPtr = std::unique_ptr; diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp index 42924179246..1d5da6a1a09 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp @@ -89,20 +89,21 @@ IcebergMetadata::IcebergMetadata( const DB::ContextPtr & context_, Int32 metadata_version_, Int32 format_version_, - const String & manifest_list_file_, const Poco::JSON::Object::Ptr & object) : WithContext(context_) , object_storage(std::move(object_storage_)) , configuration(std::move(configuration_)) , schema_processor(IcebergSchemaProcessor()) , log(getLogger("IcebergMetadata")) - , metadata_version(metadata_version_) + , current_metadata_version(metadata_version_) , format_version(format_version_) - , current_snapshot(manifest_list_file_.empty() ? 
std::nullopt : std::optional{getSnapshot(manifest_list_file_)}) { - auto schema_id = parseTableSchema(object, schema_processor, log); - schema = *(schema_processor.getClickhouseTableSchemaById(schema_id)); - current_schema_id = schema_id; + auto manifest_list_file = getRelevantManifestList(object); + if (manifest_list_file) + { + current_snapshot = getSnapshot(manifest_list_file.value()); + } + current_schema_id = parseTableSchema(object, schema_processor, log); } std::pair parseTableSchemaV2Method(const Poco::JSON::Object::Ptr & metadata_object) @@ -235,6 +236,69 @@ getMetadataFileAndVersion(const ObjectStoragePtr & object_storage, const Storage return *std::max_element(metadata_files_with_versions.begin(), metadata_files_with_versions.end()); } +Poco::JSON::Object::Ptr IcebergMetadata::readJson(const String & metadata_file_path, const ContextPtr & local_context) const +{ + StorageObjectStorageSource::ObjectInfo object_info(metadata_file_path); + auto buf = StorageObjectStorageSource::createReadBuffer(object_info, object_storage, local_context, log); + + String json_str; + readJSONObjectPossiblyInvalid(json_str, *buf); + + Poco::JSON::Parser parser; /// For some reason base/base/JSON.h can not parse this json file + Poco::Dynamic::Var json = parser.parse(json_str); + return json.extract(); +} + +bool IcebergMetadata::update(const ContextPtr & local_context) +{ + auto configuration_ptr = configuration.lock(); + + const auto [metadata_version, metadata_file_path] = getMetadataFileAndVersion(object_storage, *configuration_ptr); + + auto metadata_object = readJson(metadata_file_path, local_context); + + chassert(format_version == metadata_object->getValue("format-version")); + + if (metadata_version == current_metadata_version) + return false; + + auto manifest_list_file = getRelevantManifestList(metadata_object); + if (manifest_list_file && (!current_snapshot.has_value() || (manifest_list_file.value() != current_snapshot->getName()))) + { + current_snapshot = getSnapshot(manifest_list_file.value()); + } + current_schema_id = parseTableSchema(metadata_object, schema_processor, log); + return true; +} + +std::optional IcebergMetadata::getRelevantManifestList(const Poco::JSON::Object::Ptr & metadata) +{ + auto configuration_ptr = configuration.lock(); + + auto snapshots = metadata->get("snapshots").extract(); + + String manifest_list_file; + auto current_snapshot_id = metadata->getValue("current-snapshot-id"); + + LOG_DEBUG(&Poco::Logger::get("IcebergMetadata initialize"), "Current snapshot id {}", current_snapshot_id); + + for (size_t i = 0; i < snapshots->size(); ++i) + { + const auto snapshot = snapshots->getObject(static_cast(i)); + LOG_DEBUG( + &Poco::Logger::get("IcebergMetadata initialize"), + "Iterating on snapshot with id {}", + snapshot->getValue("snapshot-id")); + + if (snapshot->getValue("snapshot-id") == current_snapshot_id) + { + const auto path = snapshot->getValue("manifest-list"); + return std::filesystem::path(path).filename(); + } + } + return std::nullopt; +} + DataLakeMetadataPtr IcebergMetadata::create( const ObjectStoragePtr & object_storage, const ConfigurationObserverPtr & configuration, const ContextPtr & local_context) @@ -260,32 +324,8 @@ DataLakeMetadataPtr IcebergMetadata::create( auto format_version = object->getValue("format-version"); - auto snapshots = object->get("snapshots").extract(); - - String manifest_list_file; - auto current_snapshot_id = object->getValue("current-snapshot-id"); - - LOG_DEBUG(&Poco::Logger::get("IcebergMetadata initialize"), "Current 
snapshot id {}", current_snapshot_id); - - for (size_t i = 0; i < snapshots->size(); ++i) - { - const auto snapshot = snapshots->getObject(static_cast(i)); - LOG_DEBUG( - &Poco::Logger::get("IcebergMetadata initialize"), - "Iterationg on snapshot with id {}", - snapshot->getValue("snapshot-id")); - - if (snapshot->getValue("snapshot-id") == current_snapshot_id) - { - const auto path = snapshot->getValue("manifest-list"); - manifest_list_file = std::filesystem::path(configuration_ptr->getPath()) / "metadata" / std::filesystem::path(path).filename(); - break; - } - } - - auto ptr = std::make_unique( - object_storage, configuration_ptr, local_context, metadata_version, format_version, manifest_list_file, object); - + auto ptr + = std::make_unique(object_storage, configuration_ptr, local_context, metadata_version, format_version, object); return ptr; } @@ -323,7 +363,8 @@ ManifestList IcebergMetadata::initializeManifestList(const String & manifest_lis throw Exception(ErrorCodes::LOGICAL_ERROR, "Configuration is expired"); auto context = getContext(); - StorageObjectStorageSource::ObjectInfo object_info(manifest_list_file); + StorageObjectStorageSource::ObjectInfo object_info( + std::filesystem::path(configuration_ptr->getPath()) / "metadata" / manifest_list_file); auto manifest_list_buf = StorageObjectStorageSource::createReadBuffer(object_info, object_storage, context, log); LOG_DEBUG(&Poco::Logger::get("initializeManifestList"), "Parse manifest list {}", manifest_list_file); @@ -399,9 +440,7 @@ IcebergSnapshot IcebergMetadata::getSnapshot(const String & manifest_list_file) Strings IcebergMetadata::getDataFiles() const { - std::lock_guard lock(get_data_files_mutex); - if (!data_files.empty()) - return data_files; + Strings data_files; if (!current_snapshot) { diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h index 2091649a0ec..b4113b6dc26 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h @@ -48,7 +48,6 @@ public: const DB::ContextPtr & context_, Int32 metadata_version_, Int32 format_version_, - const String & manifest_list_file_, const Poco::JSON::Object::Ptr & object); @@ -57,7 +56,7 @@ public: Strings getDataFiles() const override; /// Get table schema parsed from metadata. 
- NamesAndTypesList getTableSchema() const override { return schema; } + NamesAndTypesList getTableSchema() const override { return *schema_processor.getClickhouseTableSchemaById(current_schema_id); } const std::unordered_map & getColumnNameToPhysicalNameMapping() const override { return column_name_to_physical_name; } @@ -72,7 +71,7 @@ public: static DataLakeMetadataPtr create(const ObjectStoragePtr & object_storage, const ConfigurationObserverPtr & configuration, const ContextPtr & local_context); - size_t getVersion() const { return metadata_version; } + size_t getVersion() const { return current_metadata_version; } std::shared_ptr getInitialSchemaByPath(const String & data_path) const override { @@ -90,12 +89,15 @@ public: bool supportsExternalMetadataChange() const override { return true; } + bool supportsUpdate() const override { return true; } + + bool update(const ContextPtr & local_context) override; + private: using ManifestEntryByDataFile = std::unordered_map; const ObjectStoragePtr object_storage; const ConfigurationObserverPtr configuration; - mutable IcebergSchemaProcessor schema_processor; LoggerPtr log; @@ -103,19 +105,11 @@ private: mutable ManifestListsByName manifest_lists_by_name; mutable ManifestEntryByDataFile manifest_entry_by_data_file; - Int32 metadata_version; + Int32 current_metadata_version; Int32 format_version; Int32 current_schema_id; - std::optional current_snapshot; - mutable Strings data_files; - std::unordered_map column_name_to_physical_name; - DataLakePartitionColumns partition_columns; - NamesAndTypesList schema; - - mutable std::mutex get_data_files_mutex; - ManifestList initializeManifestList(const String & manifest_list_file) const; IcebergSnapshot getSnapshot(const String & manifest_list_file) const; @@ -136,6 +130,14 @@ private: ManifestFileEntry getManifestFile(const String & manifest_file) const; ManifestFileEntry initializeManifestFile(const String & filename, const ConfigurationPtr & configuration_ptr) const; + + std::optional getRelevantManifestList(const Poco::JSON::Object::Ptr & metadata); + + Poco::JSON::Object::Ptr readJson(const String & metadata_file_path, const ContextPtr & local_context) const; + + //Fields are needed only for providing dynamic polymorphism + std::unordered_map column_name_to_physical_name; + DataLakePartitionColumns partition_columns; }; } From bc93760c5ac9ddce2028fbdff7eff62114d22c98 Mon Sep 17 00:00:00 2001 From: divanik Date: Mon, 9 Dec 2024 12:04:31 +0000 Subject: [PATCH 28/41] Deal with includes --- .../DataLakes/DataLakeConfiguration.h | 6 +- .../DataLakes/Iceberg/IcebergMetadata.cpp | 59 +++++++------------ .../DataLakes/Iceberg/IcebergMetadata.h | 27 ++++----- .../DataLakes/Iceberg/ManifestFile.cpp | 22 ++----- .../DataLakes/Iceberg/ManifestFile.h | 6 +- .../DataLakes/Iceberg/ManifestFileImpl.h | 24 +------- .../DataLakes/Iceberg/SchemaProcessor.cpp | 14 +++-- .../DataLakes/Iceberg/SchemaProcessor.h | 9 ++- .../DataLakes/Iceberg/Snapshot.h | 2 +- .../ObjectStorage/DataLakes/Iceberg/Utils.cpp | 22 +------ .../ObjectStorage/DataLakes/Iceberg/Utils.h | 19 +----- 11 files changed, 65 insertions(+), 145 deletions(-) diff --git a/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h b/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h index 52b5a553de1..29bd7b992d5 100644 --- a/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h +++ b/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h @@ -11,7 +11,6 @@ #include #include #include -#include "Common/DateLUT.h" #include #include 
"Storages/ColumnsDescription.h" @@ -47,9 +46,11 @@ public: { BaseStorageConfiguration::update(object_storage, local_context); + bool existed = current_metadata != nullptr; + if (updateMetadataObjectIfNeeded(object_storage, local_context)) { - if (hasExternalDynamicMetadata()) + if (hasExternalDynamicMetadata() && existed) { throw Exception( ErrorCodes::FORMAT_VERSION_TOO_OLD, @@ -103,6 +104,7 @@ public: BaseStorageConfiguration::setPaths(current_metadata->getDataFiles()); BaseStorageConfiguration::setPartitionColumns(current_metadata->getPartitionColumns()); } + return ColumnsDescription{current_metadata->getTableSchema()}; } diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp index 1d5da6a1a09..94b646f30f5 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp @@ -1,45 +1,25 @@ -#include -#include -#include "Common/Config/ConfigProcessor.h" -#include "Common/DateLUT.h" -#include "Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h" #include "config.h" #if USE_AVRO -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include -# include -# include +#include "Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h" +#include "Storages/ObjectStorage/DataLakes/Iceberg/Utils.h" -# include "Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h" -# include "Storages/ObjectStorage/DataLakes/Iceberg/Utils.h" - -# include "Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileImpl.h" -# include "Storages/ObjectStorage/DataLakes/Iceberg/Snapshot.h" +#include "Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileImpl.h" +#include "Storages/ObjectStorage/DataLakes/Iceberg/Snapshot.h" namespace DB { @@ -255,12 +235,15 @@ bool IcebergMetadata::update(const ContextPtr & local_context) const auto [metadata_version, metadata_file_path] = getMetadataFileAndVersion(object_storage, *configuration_ptr); + if (metadata_version == current_metadata_version) + return false; + + current_metadata_version = metadata_version; + auto metadata_object = readJson(metadata_file_path, local_context); chassert(format_version == metadata_object->getValue("format-version")); - if (metadata_version == current_metadata_version) - return false; auto manifest_list_file = getRelevantManifestList(metadata_object); if (manifest_list_file && (!current_snapshot.has_value() || (manifest_list_file.value() != current_snapshot->getName()))) diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h index b4113b6dc26..c407f22dea0 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h @@ -3,24 +3,21 @@ #if USE_AVRO /// StorageIceberg depending on Avro to parse metadata with Avro format. 
-# include -# include -# include -# include -# include +#include +#include +#include +#include +#include -# include -# include -# include +#include +#include +#include -# include "Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h" -# include "Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.h" -# include "Storages/ObjectStorage/DataLakes/Iceberg/Snapshot.h" +#include "Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h" +#include "Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.h" +#include "Storages/ObjectStorage/DataLakes/Iceberg/Snapshot.h" -# include -# include -# include -# include +#include namespace DB { diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.cpp index 1819d8bf599..a34422c72ca 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.cpp @@ -3,26 +3,12 @@ #if USE_AVRO -# include "Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileImpl.h" -# include "Storages/ObjectStorage/DataLakes/Iceberg/Utils.h" +#include "Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileImpl.h" +#include "Storages/ObjectStorage/DataLakes/Iceberg/Utils.h" -# include -# include -# include -# include -# include -# include - -# include -# include -# include - -# include -# include "DataTypes/DataTypeTuple.h" -# include "Formats/FormatSettings.h" - -# include +#include +#include "DataTypes/DataTypeTuple.h" namespace DB diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h index 9dd98799917..57f013c55e5 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h @@ -1,11 +1,11 @@ #pragma once -#include + #include "config.h" #if USE_AVRO -# include -# include +#include +#include namespace DB diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileImpl.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileImpl.h index 75618a6a44b..668f6604317 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileImpl.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileImpl.h @@ -2,28 +2,8 @@ #if USE_AVRO - -# include "Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h" -# include "Storages/ObjectStorage/DataLakes/Iceberg/Utils.h" - - -# include -# include -# include -# include -# include -# include - -# include -# include -# include - -# include -# include "DataTypes/DataTypeTuple.h" -# include "Formats/FormatSettings.h" - -# include - +#include "Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h" +#include namespace DB { diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.cpp index 886626c6de0..1e2df7a4dbb 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.cpp @@ -1,4 +1,4 @@ -#include "Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.h" +#include #include #include @@ -21,14 +21,18 @@ #include #include -namespace ErrorCodes -{ -extern const int LOGICAL_ERROR; -} +#include + namespace DB { +namespace ErrorCodes +{ +extern const int LOGICAL_ERROR; +extern const int BAD_ARGUMENTS; +extern const int UNSUPPORTED_METHOD; +} namespace Iceberg { diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.h 
b/src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.h index 755f8e940b4..657fd352da1 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.h @@ -2,20 +2,19 @@ #include #include -#include #include "config.h" +#include #include -#include -#include -#include -#include +#include + #include #include #include +#include namespace DB { diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/Snapshot.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/Snapshot.h index d75ebd6b2ab..36a141095b4 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/Snapshot.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/Snapshot.h @@ -2,7 +2,7 @@ #include "config.h" #if USE_AVRO -# include "Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h" +#include "Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h" namespace DB { diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.cpp index c5b1e5f65c8..22e0912e590 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.cpp @@ -1,26 +1,10 @@ #include "config.h" -#if USE_AVRO /// StorageIceberg depending on Avro to parse metadata with Avro format. +#if USE_AVRO -# include "Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h" - -# include -# include -# include -# include -# include -# include - -# include -# include -# include - -# include -# include "DataTypes/DataTypeTuple.h" -# include "Formats/FormatSettings.h" - -# include +#include +#include namespace DB { diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.h index 69515e50055..2e6161bd775 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.h @@ -2,24 +2,9 @@ #include "config.h" -#if USE_AVRO /// StorageIceberg depending on Avro to parse metadata with Avro format. 
+#if USE_AVRO - -# include -# include -# include -# include -# include -# include - -# include -# include -# include - -# include -# include "Formats/FormatSettings.h" - -# include +#include namespace DB { From 01ddf27e0fa3f6a888603ba0c7ae24ad894d5e63 Mon Sep 17 00:00:00 2001 From: divanik Date: Mon, 9 Dec 2024 12:13:46 +0000 Subject: [PATCH 29/41] Fix style check --- .../ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp | 5 ++--- .../ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h | 2 +- .../ObjectStorage/DataLakes/Iceberg/ManifestFile.cpp | 3 +++ .../ObjectStorage/DataLakes/Iceberg/ManifestFileImpl.h | 5 ----- 4 files changed, 6 insertions(+), 9 deletions(-) diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp index 94b646f30f5..7d346690b3d 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp @@ -33,7 +33,6 @@ namespace ErrorCodes extern const int FILE_DOESNT_EXIST; extern const int ILLEGAL_COLUMN; extern const int BAD_ARGUMENTS; -extern const int UNSUPPORTED_METHOD; extern const int LOGICAL_ERROR; } @@ -216,7 +215,7 @@ getMetadataFileAndVersion(const ObjectStoragePtr & object_storage, const Storage return *std::max_element(metadata_files_with_versions.begin(), metadata_files_with_versions.end()); } -Poco::JSON::Object::Ptr IcebergMetadata::readJson(const String & metadata_file_path, const ContextPtr & local_context) const +Poco::JSON::Object::Ptr IcebergMetadata::readJSON(const String & metadata_file_path, const ContextPtr & local_context) const { StorageObjectStorageSource::ObjectInfo object_info(metadata_file_path); auto buf = StorageObjectStorageSource::createReadBuffer(object_info, object_storage, local_context, log); @@ -240,7 +239,7 @@ bool IcebergMetadata::update(const ContextPtr & local_context) current_metadata_version = metadata_version; - auto metadata_object = readJson(metadata_file_path, local_context); + auto metadata_object = readJSON(metadata_file_path, local_context); chassert(format_version == metadata_object->getValue("format-version")); diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h index c407f22dea0..ea159feec7e 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h @@ -130,7 +130,7 @@ private: std::optional getRelevantManifestList(const Poco::JSON::Object::Ptr & metadata); - Poco::JSON::Object::Ptr readJson(const String & metadata_file_path, const ContextPtr & local_context) const; + Poco::JSON::Object::Ptr readJSON(const String & metadata_file_path, const ContextPtr & local_context) const; //Fields are needed only for providing dynamic polymorphism std::unordered_map column_name_to_physical_name; diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.cpp index a34422c72ca..4289ba6a54b 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.cpp @@ -1,3 +1,4 @@ +#include #include "config.h" #if USE_AVRO @@ -17,6 +18,8 @@ namespace DB namespace ErrorCodes { extern const int ILLEGAL_COLUMN; +extern const int BAD_ARGUMENTS; +extern const int UNSUPPORTED_METHOD; } namespace Iceberg diff --git 
a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileImpl.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileImpl.h index 668f6604317..f9500e8994a 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileImpl.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileImpl.h @@ -8,11 +8,6 @@ namespace DB { -namespace ErrorCodes -{ -extern const int ILLEGAL_COLUMN; -} - namespace Iceberg { From adaf1b30988330232bb929e44c4fc98beb8e497f Mon Sep 17 00:00:00 2001 From: divanik Date: Mon, 9 Dec 2024 12:23:21 +0000 Subject: [PATCH 30/41] Correct style --- .../DataLakes/Iceberg/IcebergMetadata.cpp | 26 ------------------ .../DataLakes/Iceberg/IcebergMetadata.h | 2 +- .../DataLakes/Iceberg/ManifestFile.h | 4 +-- .../DataLakes/Iceberg/ManifestFileImpl.h | 27 ++++++++++++++++++- .../DataLakes/Iceberg/Snapshot.h | 2 -- .../ObjectStorage/StorageObjectStorage.cpp | 2 +- 6 files changed, 30 insertions(+), 33 deletions(-) diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp index 7d346690b3d..53370ab70c1 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp @@ -312,32 +312,6 @@ DataLakeMetadataPtr IcebergMetadata::create( return ptr; } -/** - * Manifest file has the following format: '/iceberg_data/db/table_name/metadata/c87bfec7-d36c-4075-ad04-600b6b0f2020-m0.avro' - * - * `manifest file` is different in format version V1 and V2 and has the following contents: - * v1 v2 - * status req req - * snapshot_id req opt - * sequence_number opt - * file_sequence_number opt - * data_file req req - * Example format version V1: - * ┌─status─┬─────────snapshot_id─┬─data_file───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ - * │ 1 │ 2819310504515118887 │ ('/iceberg_data/db/table_name/data/00000-1-3edca534-15a0-4f74-8a28-4733e0bf1270-00001.parquet','PARQUET',(),100,1070,67108864,[(1,233),(2,210)],[(1,100),(2,100)],[(1,0),(2,0)],[],[(1,'\0'),(2,'0')],[(1,'c'),(2,'99')],NULL,[4],0) │ - * └────────┴─────────────────────┴─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ - * Example format version V2: - * ┌─status─┬─────────snapshot_id─┬─sequence_number─┬─file_sequence_number─┬─data_file───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ - * │ 1 │ 5887006101709926452 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ (0,'/iceberg_data/db/table_name/data/00000-1-c8045c90-8799-4eac-b957-79a0484e223c-00001.parquet','PARQUET',(),100,1070,[(1,233),(2,210)],[(1,100),(2,100)],[(1,0),(2,0)],[],[(1,'\0'),(2,'0')],[(1,'c'),(2,'99')],NULL,[4],[],0) │ - * └────────┴─────────────────────┴─────────────────┴──────────────────────┴─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ - * In case of 
partitioned data we'll have extra directory partition=value: - * ─status─┬─────────snapshot_id─┬─data_file──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ - * │ 1 │ 2252246380142525104 │ ('/iceberg_data/db/table_name/data/a=0/00000-1-c9535a00-2f4f-405c-bcfa-6d4f9f477235-00001.parquet','PARQUET',(0),1,631,67108864,[(1,46),(2,48)],[(1,1),(2,1)],[(1,0),(2,0)],[],[(1,'\0\0\0\0\0\0\0\0'),(2,'1')],[(1,'\0\0\0\0\0\0\0\0'),(2,'1')],NULL,[4],0) │ - * │ 1 │ 2252246380142525104 │ ('/iceberg_data/db/table_name/data/a=1/00000-1-c9535a00-2f4f-405c-bcfa-6d4f9f477235-00002.parquet','PARQUET',(1),1,631,67108864,[(1,46),(2,48)],[(1,1),(2,1)],[(1,0),(2,0)],[],[(1,'\0\0\0\0\0\0\0'),(2,'2')],[(1,'\0\0\0\0\0\0\0'),(2,'2')],NULL,[4],0) │ - * │ 1 │ 2252246380142525104 │ ('/iceberg_data/db/table_name/data/a=2/00000-1-c9535a00-2f4f-405c-bcfa-6d4f9f477235-00003.parquet','PARQUET',(2),1,631,67108864,[(1,46),(2,48)],[(1,1),(2,1)],[(1,0),(2,0)],[],[(1,'\0\0\0\0\0\0\0'),(2,'3')],[(1,'\0\0\0\0\0\0\0'),(2,'3')],NULL,[4],0) │ - * └────────┴─────────────────────┴────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ - */ - ManifestList IcebergMetadata::initializeManifestList(const String & manifest_list_file) const { auto configuration_ptr = configuration.lock(); diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h index ea159feec7e..4882b12c4a6 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h @@ -1,7 +1,7 @@ #pragma once #include "config.h" -#if USE_AVRO /// StorageIceberg depending on Avro to parse metadata with Avro format. 
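The comment block relocated into ManifestFileImpl.h below documents the status and data_file columns, but the discriminator values behind them come from the Iceberg spec rather than being restated in this series. Assuming the spec-defined values, the two enums declared in ManifestFile.h read as:

#include <cstdint>

/// Values as given by the Iceberg spec (an assumption; the patch itself does
/// not spell them out): the status of a manifest entry, and in format v2 the
/// kind of content a data file carries.
enum class ManifestEntryStatus : uint8_t
{
    EXISTING = 0,
    ADDED = 1,
    DELETED = 2,
};

enum class DataFileContent : uint8_t
{
    DATA = 0,
    POSITION_DELETES = 1,
    EQUALITY_DELETES = 2,
};

This is consistent with the example rows above, where status = 1 marks freshly added data files, and with the earlier check that rejects anything but DataFileContent::DATA.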
+#if USE_AVRO #include #include diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h index 57f013c55e5..504b789b484 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h @@ -7,13 +7,12 @@ #include #include - namespace DB { - namespace Iceberg { + class ManifestFileContentImpl; enum class ManifestEntryStatus : uint8_t @@ -38,6 +37,7 @@ struct DataFileEntry DataFileContent content; }; + class ManifestFileContent { public: diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileImpl.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileImpl.h index f9500e8994a..de6ddba4dd7 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileImpl.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileImpl.h @@ -2,8 +2,8 @@ #if USE_AVRO -#include "Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h" #include +#include namespace DB { @@ -11,6 +11,31 @@ namespace DB namespace Iceberg { +/** + * Manifest file has the following format: '/iceberg_data/db/table_name/metadata/c87bfec7-d36c-4075-ad04-600b6b0f2020-m0.avro' + * + * `manifest file` is different in format version V1 and V2 and has the following contents: + * v1 v2 + * status req req + * snapshot_id req opt + * sequence_number opt + * file_sequence_number opt + * data_file req req + * Example format version V1: + * ┌─status─┬─────────snapshot_id─┬─data_file───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ + * │ 1 │ 2819310504515118887 │ ('/iceberg_data/db/table_name/data/00000-1-3edca534-15a0-4f74-8a28-4733e0bf1270-00001.parquet','PARQUET',(),100,1070,67108864,[(1,233),(2,210)],[(1,100),(2,100)],[(1,0),(2,0)],[],[(1,'\0'),(2,'0')],[(1,'c'),(2,'99')],NULL,[4],0) │ + * └────────┴─────────────────────┴─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ + * Example format version V2: + * ┌─status─┬─────────snapshot_id─┬─sequence_number─┬─file_sequence_number─┬─data_file───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ + * │ 1 │ 5887006101709926452 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ (0,'/iceberg_data/db/table_name/data/00000-1-c8045c90-8799-4eac-b957-79a0484e223c-00001.parquet','PARQUET',(),100,1070,[(1,233),(2,210)],[(1,100),(2,100)],[(1,0),(2,0)],[],[(1,'\0'),(2,'0')],[(1,'c'),(2,'99')],NULL,[4],[],0) │ + * └────────┴─────────────────────┴─────────────────┴──────────────────────┴─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ + * In case of partitioned data we'll have extra directory partition=value: + * 
─status─┬─────────snapshot_id─┬─data_file──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ + * │ 1 │ 2252246380142525104 │ ('/iceberg_data/db/table_name/data/a=0/00000-1-c9535a00-2f4f-405c-bcfa-6d4f9f477235-00001.parquet','PARQUET',(0),1,631,67108864,[(1,46),(2,48)],[(1,1),(2,1)],[(1,0),(2,0)],[],[(1,'\0\0\0\0\0\0\0\0'),(2,'1')],[(1,'\0\0\0\0\0\0\0\0'),(2,'1')],NULL,[4],0) │ + * │ 1 │ 2252246380142525104 │ ('/iceberg_data/db/table_name/data/a=1/00000-1-c9535a00-2f4f-405c-bcfa-6d4f9f477235-00002.parquet','PARQUET',(1),1,631,67108864,[(1,46),(2,48)],[(1,1),(2,1)],[(1,0),(2,0)],[],[(1,'\0\0\0\0\0\0\0'),(2,'2')],[(1,'\0\0\0\0\0\0\0'),(2,'2')],NULL,[4],0) │ + * │ 1 │ 2252246380142525104 │ ('/iceberg_data/db/table_name/data/a=2/00000-1-c9535a00-2f4f-405c-bcfa-6d4f9f477235-00003.parquet','PARQUET',(2),1,631,67108864,[(1,46),(2,48)],[(1,1),(2,1)],[(1,0),(2,0)],[],[(1,'\0\0\0\0\0\0\0'),(2,'3')],[(1,'\0\0\0\0\0\0\0'),(2,'3')],NULL,[4],0) │ + * └────────┴─────────────────────┴────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ + */ class ManifestFileContentImpl { public: diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/Snapshot.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/Snapshot.h index 36a141095b4..49ce97d54c3 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/Snapshot.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/Snapshot.h @@ -10,7 +10,6 @@ namespace DB namespace Iceberg { - class ManifestList { public: @@ -33,7 +32,6 @@ public: private: - // Int32 snapshot_id; ManifestListsByName::const_iterator reference; }; diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index b3295f4177a..e2e63ceeb46 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -1,5 +1,5 @@ - #include #include +#include #include #include From bfb11979914f8886c7d5b5e3293c12074122e409 Mon Sep 17 00:00:00 2001 From: divanik Date: Mon, 9 Dec 2024 12:37:00 +0000 Subject: [PATCH 31/41] Fix style check --- src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileImpl.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileImpl.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileImpl.h index de6ddba4dd7..002ef1434b0 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileImpl.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileImpl.h @@ -1,3 +1,5 @@ +#pragma once + #include "config.h" #if USE_AVRO From d4bab00fcc0c5fdf5d4c86c4b75cac1b2a62ed4d Mon Sep 17 00:00:00 2001 From: divanik Date: Mon, 9 Dec 2024 13:07:15 +0000 Subject: [PATCH 32/41] Add data files cache --- .../DataLakes/Iceberg/IcebergMetadata.cpp | 14 ++++++++++---- .../DataLakes/Iceberg/IcebergMetadata.h | 2 ++ 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp index 53370ab70c1..3e2fac4dcaa 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp +++ 
b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp @@ -248,6 +248,7 @@ bool IcebergMetadata::update(const ContextPtr & local_context) if (manifest_list_file && (!current_snapshot.has_value() || (manifest_list_file.value() != current_snapshot->getName()))) { current_snapshot = getSnapshot(manifest_list_file.value()); + cached_files_for_current_snapshot = std::nullopt; } current_schema_id = parseTableSchema(metadata_object, schema_processor, log); return true; @@ -396,13 +397,17 @@ IcebergSnapshot IcebergMetadata::getSnapshot(const String & manifest_list_file) Strings IcebergMetadata::getDataFiles() const { - Strings data_files; - if (!current_snapshot) { return {}; } + if (cached_files_for_current_snapshot.has_value()) + { + return cached_files_for_current_snapshot.value(); + } + + Strings data_files; for (const auto & manifest_entry : current_snapshot->getManifestList().getManifestFiles()) { for (const auto & data_file : manifest_entry.getContent().getDataFiles()) @@ -414,9 +419,10 @@ Strings IcebergMetadata::getDataFiles() const } } - return data_files; -} + cached_files_for_current_snapshot.emplace(std::move(data_files)); + return cached_files_for_current_snapshot.value(); +} } } diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h index 4882b12c4a6..0913610a8bb 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h @@ -107,6 +107,8 @@ private: Int32 current_schema_id; std::optional current_snapshot; + mutable std::optional cached_files_for_current_snapshot; + ManifestList initializeManifestList(const String & manifest_list_file) const; IcebergSnapshot getSnapshot(const String & manifest_list_file) const; From 5ff54b030893e01ab0e66bdbcf0a149717a573cf Mon Sep 17 00:00:00 2001 From: divanik Date: Mon, 9 Dec 2024 13:38:06 +0000 Subject: [PATCH 33/41] Fix ifdefs issue --- .../ObjectStorage/DataLakes/DataLakeConfiguration.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h b/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h index 29bd7b992d5..13b4d301daf 100644 --- a/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h +++ b/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h @@ -162,20 +162,21 @@ private: } }; -using IcebergMetadata = Iceberg::IcebergMetadata; #if USE_AVRO +using IcebergMetadata = Iceberg::IcebergMetadata; + #if USE_AWS_S3 using StorageS3IcebergConfiguration = DataLakeConfiguration; -# endif +#endif #if USE_AZURE_BLOB_STORAGE using StorageAzureIcebergConfiguration = DataLakeConfiguration; -# endif +#endif #if USE_HDFS using StorageHDFSIcebergConfiguration = DataLakeConfiguration; -# endif +#endif using StorageLocalIcebergConfiguration = DataLakeConfiguration; #endif @@ -183,7 +184,7 @@ using StorageLocalIcebergConfiguration = DataLakeConfiguration; -# endif +#endif #endif #if USE_AWS_S3 From 27fcf30eb0254e2fb85d934ffbc14fe453751981 Mon Sep 17 00:00:00 2001 From: divanik Date: Mon, 9 Dec 2024 16:31:11 +0000 Subject: [PATCH 34/41] Fix clang tidy issue --- src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp index 3e2fac4dcaa..cda428b34f3 100644 --- 
a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp @@ -260,7 +260,6 @@ std::optional IcebergMetadata::getRelevantManifestList(const Poco::JSON: auto snapshots = metadata->get("snapshots").extract(); - String manifest_list_file; auto current_snapshot_id = metadata->getValue("current-snapshot-id"); LOG_DEBUG(&Poco::Logger::get("IcebergMetadata initialize"), "Current snapshot id {}", current_snapshot_id); From aa043940c7f6991bc048d3ea19d9b070aba32118 Mon Sep 17 00:00:00 2001 From: divanik Date: Tue, 10 Dec 2024 10:07:14 +0000 Subject: [PATCH 35/41] Resolve issues --- .../DataLakes/Iceberg/IcebergMetadata.cpp | 25 ++++++++++--------- .../DataLakes/Iceberg/IcebergMetadata.h | 16 +++--------- .../DataLakes/Iceberg/ManifestFile.h | 2 +- .../DataLakes/Iceberg/ManifestFileImpl.h | 2 +- .../DataLakes/Iceberg/SchemaProcessor.cpp | 5 ---- .../DataLakes/Iceberg/SchemaProcessor.h | 2 +- 6 files changed, 19 insertions(+), 33 deletions(-) diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp index cda428b34f3..a92ae7f346b 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp @@ -262,15 +262,9 @@ std::optional IcebergMetadata::getRelevantManifestList(const Poco::JSON: auto current_snapshot_id = metadata->getValue("current-snapshot-id"); - LOG_DEBUG(&Poco::Logger::get("IcebergMetadata initialize"), "Current snapshot id {}", current_snapshot_id); - for (size_t i = 0; i < snapshots->size(); ++i) { const auto snapshot = snapshots->getObject(static_cast(i)); - LOG_DEBUG( - &Poco::Logger::get("IcebergMetadata initialize"), - "Iterationg on snapshot with id {}", - snapshot->getValue("snapshot-id")); if (snapshot->getValue("snapshot-id") == current_snapshot_id) { @@ -281,6 +275,19 @@ std::optional IcebergMetadata::getRelevantManifestList(const Poco::JSON: return std::nullopt; } +std::optional IcebergMetadata::getSchemaVersionByFileIfOutdated(String data_path) const +{ + auto manifest_file_it = manifest_entry_by_data_file.find(data_path); + if (manifest_file_it == manifest_entry_by_data_file.end()) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot find schema version for data file: {}", data_path); + } + auto schema_id = manifest_file_it->second.getContent().getSchemaId(); + if (schema_id == current_schema_id) + return std::nullopt; + return std::optional{schema_id}; +} + DataLakeMetadataPtr IcebergMetadata::create( const ObjectStoragePtr & object_storage, const ConfigurationObserverPtr & configuration, const ContextPtr & local_context) @@ -290,7 +297,6 @@ DataLakeMetadataPtr IcebergMetadata::create( const auto [metadata_version, metadata_file_path] = getMetadataFileAndVersion(object_storage, *configuration_ptr); auto log = getLogger("IcebergMetadata"); - LOG_DEBUG(log, "Parse metadata {}", metadata_file_path); StorageObjectStorageSource::ObjectInfo object_info(metadata_file_path); auto buf = StorageObjectStorageSource::createReadBuffer(object_info, object_storage, local_context, log); @@ -323,12 +329,9 @@ ManifestList IcebergMetadata::initializeManifestList(const String & manifest_lis std::filesystem::path(configuration_ptr->getPath()) / "metadata" / manifest_list_file); auto manifest_list_buf = StorageObjectStorageSource::createReadBuffer(object_info, object_storage, context, log); - LOG_DEBUG(&Poco::Logger::get("initializeManifestList"), 
"Parse manifest list {}", manifest_list_file); auto manifest_list_file_reader = std::make_unique(std::make_unique(*manifest_list_buf)); - LOG_DEBUG(&Poco::Logger::get("initializeManifestList"), "Parsed manifest list {}", manifest_list_file); - auto data_type = AvroSchemaReader::avroNodeToDataType(manifest_list_file_reader->dataSchema().root()->leafAt(0)); Block header{{data_type->createColumn(), data_type, "manifest_path"}}; auto columns = parseAvro(*manifest_list_file_reader, header, getFormatSettings(context)); @@ -378,8 +381,6 @@ ManifestFileEntry IcebergMetadata::initializeManifestFile(const String & filenam { manifest_entry_by_data_file.emplace(data_file.data_file_name, manifest_file_entry); } - LOG_DEBUG(&Poco::Logger::get("IcebergMetadata"), "Added manifest file {}", manifest_file); - schema_processor.addIcebergTableSchema(schema_object); return manifest_file_entry; } diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h index 0913610a8bb..816857126da 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h @@ -49,7 +49,8 @@ public: /// Get data files. On first request it reads manifest_list file and iterates through manifest files to find all data files. - /// All subsequent calls will return saved list of files (because it cannot be changed without changing metadata file) + /// All subsequent calls when the same data snapshot is relevant will return saved list of files (because it cannot be changed + /// without changing metadata file). Drops on every snapshot update. Strings getDataFiles() const override; /// Get table schema parsed from metadata. @@ -113,18 +114,7 @@ private: IcebergSnapshot getSnapshot(const String & manifest_list_file) const; - std::optional getSchemaVersionByFileIfOutdated(String data_path) const - { - auto manifest_file_it = manifest_entry_by_data_file.find(data_path); - if (manifest_file_it == manifest_entry_by_data_file.end()) - { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot find schema version for data file: {}", data_path); - } - auto schema_id = manifest_file_it->second.getContent().getSchemaId(); - if (schema_id == current_schema_id) - return std::nullopt; - return std::optional{schema_id}; - } + std::optional getSchemaVersionByFileIfOutdated(String data_path) const; ManifestFileEntry getManifestFile(const String & manifest_file) const; diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h index 504b789b484..cb434a4734f 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h @@ -13,7 +13,7 @@ namespace DB namespace Iceberg { -class ManifestFileContentImpl; +struct ManifestFileContentImpl; enum class ManifestEntryStatus : uint8_t { diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileImpl.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileImpl.h index 002ef1434b0..d2b474afa77 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileImpl.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFileImpl.h @@ -38,7 +38,7 @@ namespace Iceberg * │ 1 │ 2252246380142525104 │ 
('/iceberg_data/db/table_name/data/a=2/00000-1-c9535a00-2f4f-405c-bcfa-6d4f9f477235-00003.parquet','PARQUET',(2),1,631,67108864,[(1,46),(2,48)],[(1,1),(2,1)],[(1,0),(2,0)],[],[(1,'\0\0\0\0\0\0\0'),(2,'3')],[(1,'\0\0\0\0\0\0\0'),(2,'3')],NULL,[4],0) │ * └────────┴─────────────────────┴────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ */ -class ManifestFileContentImpl +struct ManifestFileContentImpl { public: explicit ManifestFileContentImpl( diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.cpp index 1e2df7a4dbb..7943809bee3 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.cpp @@ -57,11 +57,6 @@ bool operator==(const Poco::JSON::Object & first, const Poco::JSON::Object & sec return first_string_stream.str() == second_string_stream.str(); } -// bool operator!=(const Poco::JSON::Object & first, const Poco::JSON::Object & second) -// { -// return !(first == second); -// } - std::pair parseDecimal(const String & type_name) { DB::ReadBufferFromString buf(std::string_view(type_name.begin() + 8, type_name.end() - 1)); diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.h index 657fd352da1..f7d4c3d7e96 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.h @@ -23,7 +23,7 @@ namespace Iceberg /** - * Iceberg supports the next data types (see https://iceberg.apache.org/spec/#schemas-and-data-types): + * Iceberg supports the following data types (see https://iceberg.apache.org/spec/#schemas-and-data-types): * - Primitive types: * - boolean * - int From 05ca6d83a13973f6a6f4f9dffde22216a0ee925e Mon Sep 17 00:00:00 2001 From: divanik Date: Tue, 10 Dec 2024 11:16:52 +0000 Subject: [PATCH 36/41] Style fix --- .../ObjectStorage/DataLakes/IcebergMetadata.h | 238 ++++++++++++++++++ 1 file changed, 238 insertions(+) create mode 100644 src/Storages/ObjectStorage/DataLakes/IcebergMetadata.h diff --git a/src/Storages/ObjectStorage/DataLakes/IcebergMetadata.h b/src/Storages/ObjectStorage/DataLakes/IcebergMetadata.h new file mode 100644 index 00000000000..f89fd0a2257 --- /dev/null +++ b/src/Storages/ObjectStorage/DataLakes/IcebergMetadata.h @@ -0,0 +1,238 @@ +#pragma once + +#include +#include +#include +#include +#include "config.h" + +#if USE_AVRO /// StorageIceberg depending on Avro to parse metadata with Avro format. + +# include +# include +# include +# include +# include + +# include +# include +# include + +namespace DB +{ + +/** + * Iceberg supports the next data types (see https://iceberg.apache.org/spec/#schemas-and-data-types): + * - Primitive types: + * - boolean + * - int + * - long + * - float + * - double + * - decimal(P, S) + * - date + * - time (time of day in microseconds since midnight) + * - timestamp (in microseconds since 1970-01-01) + * - timestamptz (timestamp with timezone, stores values in UTC timezone) + * - string + * - uuid + * - fixed(L) (fixed-length byte array of length L) + * - binary + * - Complex types: + * - struct(field1: Type1, field2: Type2, ...) 
(tuple of typed values) + * - list(nested_type) + * - map(Key, Value) + * + * Example of table schema in metadata: + * { + * "type" : "struct", + * "schema-id" : 0, + * "fields" : [ + * { + * "id" : 1, + * "name" : "id", + * "required" : false, + * "type" : "long" + * }, + * { + * "id" : 2, + * "name" : "array", + * "required" : false, + * "type" : { + * "type" : "list", + * "element-id" : 5, + * "element" : "int", + * "element-required" : false + * }, + * { + * "id" : 3, + * "name" : "data", + * "required" : false, + * "type" : "binary" + * } + * } + */ +class IcebergSchemaProcessor +{ + using Node = ActionsDAG::Node; + +public: + void addIcebergTableSchema(Poco::JSON::Object::Ptr schema_ptr); + std::shared_ptr getClickhouseTableSchemaById(Int32 id); + std::shared_ptr getSchemaTransformationDagByIds(Int32 old_id, Int32 new_id); + +private: + std::unordered_map iceberg_table_schemas_by_ids; + std::unordered_map> clickhouse_table_schemas_by_ids; + std::map, std::shared_ptr> transform_dags_by_ids; + + NamesAndTypesList getSchemaType(const Poco::JSON::Object::Ptr & schema); + DataTypePtr getComplexTypeFromObject(const Poco::JSON::Object::Ptr & type); + DataTypePtr getFieldType(const Poco::JSON::Object::Ptr & field, const String & type_key, bool required); + DataTypePtr getSimpleType(const String & type_name); + + bool allowPrimitiveTypeConversion(const String & old_type, const String & new_type); + const Node * getDefaultNodeForField(const Poco::JSON::Object::Ptr & field); + + std::shared_ptr getSchemaTransformationDag( + const Poco::JSON::Object::Ptr & old_schema, const Poco::JSON::Object::Ptr & new_schema, Int32 old_id, Int32 new_id); + + std::mutex mutex; +}; + + +/** + * Useful links: + * - https://iceberg.apache.org/spec/ + * + * Iceberg has two format versions, v1 and v2. The content of metadata files depends on the version. + * + * Unlike DeltaLake, Iceberg has several metadata layers: `table metadata`, `manifest list` and `manifest_files`. + * Metadata file - json file. + * Manifest list – an Avro file that lists manifest files; one per snapshot. + * Manifest file – an Avro file that lists data or delete files; a subset of a snapshot. + * All changes to table state create a new metadata file and replace the old metadata with an atomic swap. + * + * In order to find out which data files to read, we need to find the `manifest list` + * which corresponds to the latest snapshot. We find it by checking a list of snapshots + * in metadata's "snapshots" section. + * + * Example of metadata.json file. + * { + * "format-version" : 1, + * "table-uuid" : "ca2965ad-aae2-4813-8cf7-2c394e0c10f5", + * "location" : "/iceberg_data/db/table_name", + * "last-updated-ms" : 1680206743150, + * "last-column-id" : 2, + * "schema" : { "type" : "struct", "schema-id" : 0, "fields" : [ {}, {}, ... ] }, + * "current-schema-id" : 0, + * "schemas" : [ ], + * ... 
+ * "current-snapshot-id" : 2819310504515118887, + * "refs" : { "main" : { "snapshot-id" : 2819310504515118887, "type" : "branch" } }, + * "snapshots" : [ { + * "snapshot-id" : 2819310504515118887, + * "timestamp-ms" : 1680206743150, + * "summary" : { + * "operation" : "append", "spark.app.id" : "local-1680206733239", + * "added-data-files" : "1", "added-records" : "100", + * "added-files-size" : "1070", "changed-partition-count" : "1", + * "total-records" : "100", "total-files-size" : "1070", "total-data-files" : "1", "total-delete-files" : "0", + * "total-position-deletes" : "0", "total-equality-deletes" : "0" + * }, + * "manifest-list" : "/iceberg_data/db/table_name/metadata/snap-2819310504515118887-1-c87bfec7-d36c-4075-ad04-600b6b0f2020.avro", + * "schema-id" : 0 + * } ], + * "statistics" : [ ], + * "snapshot-log" : [ ... ], + * "metadata-log" : [ ] + * } + */ +class IcebergMetadata : public IDataLakeMetadata, private WithContext +{ +public: + using ConfigurationObserverPtr = StorageObjectStorage::ConfigurationObserverPtr; + + static constexpr auto name = "Iceberg"; + + IcebergMetadata( + ObjectStoragePtr object_storage_, + ConfigurationObserverPtr configuration_, + const DB::ContextPtr & context_, + Int32 metadata_version_, + Int32 format_version_, + String manifest_list_file_, + const Poco::JSON::Object::Ptr & object); + + /// Get data files. On first request it reads manifest_list file and iterates through manifest files to find all data files. + /// All subsequent calls will return saved list of files (because it cannot be changed without changing metadata file) + Strings getDataFiles() const override; + + /// Get table schema parsed from metadata. + NamesAndTypesList getTableSchema() const override { return schema; } + + const std::unordered_map & getColumnNameToPhysicalNameMapping() const override { return column_name_to_physical_name; } + + const DataLakePartitionColumns & getPartitionColumns() const override { return partition_columns; } + + bool operator==(const IDataLakeMetadata & other) const override + { + const auto * iceberg_metadata = dynamic_cast(&other); + return iceberg_metadata && getVersion() == iceberg_metadata->getVersion(); + } + + static DataLakeMetadataPtr + create(const ObjectStoragePtr & object_storage, const ConfigurationObserverPtr & configuration, const ContextPtr & local_context); + + size_t getVersion() const { return metadata_version; } + + std::shared_ptr getInitialSchemaByPath(const String & data_path) const override + { + auto version_if_outdated = getSchemaVersionByFileIfOutdated(data_path); + return version_if_outdated.has_value() ? schema_processor.getClickhouseTableSchemaById(version_if_outdated.value()) : nullptr; + } + + std::shared_ptr getSchemaTransformer(const String & data_path) const override + { + auto version_if_outdated = getSchemaVersionByFileIfOutdated(data_path); + return version_if_outdated.has_value() + ? 
schema_processor.getSchemaTransformationDagByIds(version_if_outdated.value(), current_schema_id) + : nullptr; + } + + bool supportsExternalMetadataChange() const override { return true; } + +private: + mutable std::unordered_map schema_id_by_data_file; + + const ObjectStoragePtr object_storage; + const ConfigurationObserverPtr configuration; + Int32 metadata_version; + Int32 format_version; + String manifest_list_file; + Int32 current_schema_id; + mutable Strings data_files; + std::unordered_map column_name_to_physical_name; + DataLakePartitionColumns partition_columns; + NamesAndTypesList schema; + mutable IcebergSchemaProcessor schema_processor; + LoggerPtr log; + + mutable std::mutex get_data_files_mutex; + + std::optional getSchemaVersionByFileIfOutdated(String data_path) const + { + auto schema_id = schema_id_by_data_file.find(data_path); + if (schema_id == schema_id_by_data_file.end()) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot find schema version for data file: {}", data_path); + } + if (schema_id->second == current_schema_id) + return std::nullopt; + return std::optional{schema_id->second}; + } +}; + +} + +#endif From 088152febd88f6fc225718b013092cba487f98de Mon Sep 17 00:00:00 2001 From: divanik Date: Tue, 10 Dec 2024 12:33:36 +0000 Subject: [PATCH 37/41] Style check --- .../DataLakes/Iceberg/IcebergMetadata.h | 6 - .../ObjectStorage/DataLakes/IcebergMetadata.h | 238 ------------------ 2 files changed, 244 deletions(-) delete mode 100644 src/Storages/ObjectStorage/DataLakes/IcebergMetadata.h diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h index 816857126da..f7765f4ece5 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h @@ -21,12 +21,6 @@ namespace DB { - -namespace ErrorCodes -{ -extern const int BAD_ARGUMENTS; -} - namespace Iceberg { diff --git a/src/Storages/ObjectStorage/DataLakes/IcebergMetadata.h b/src/Storages/ObjectStorage/DataLakes/IcebergMetadata.h deleted file mode 100644 index f89fd0a2257..00000000000 --- a/src/Storages/ObjectStorage/DataLakes/IcebergMetadata.h +++ /dev/null @@ -1,238 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include "config.h" - -#if USE_AVRO /// StorageIceberg depending on Avro to parse metadata with Avro format. - -# include -# include -# include -# include -# include - -# include -# include -# include - -namespace DB -{ - -/** - * Iceberg supports the next data types (see https://iceberg.apache.org/spec/#schemas-and-data-types): - * - Primitive types: - * - boolean - * - int - * - long - * - float - * - double - * - decimal(P, S) - * - date - * - time (time of day in microseconds since midnight) - * - timestamp (in microseconds since 1970-01-01) - * - timestamptz (timestamp with timezone, stores values in UTC timezone) - * - string - * - uuid - * - fixed(L) (fixed-length byte array of length L) - * - binary - * - Complex types: - * - struct(field1: Type1, field2: Type2, ...) 
(tuple of typed values) - * - list(nested_type) - * - map(Key, Value) - * - * Example of table schema in metadata: - * { - * "type" : "struct", - * "schema-id" : 0, - * "fields" : [ - * { - * "id" : 1, - * "name" : "id", - * "required" : false, - * "type" : "long" - * }, - * { - * "id" : 2, - * "name" : "array", - * "required" : false, - * "type" : { - * "type" : "list", - * "element-id" : 5, - * "element" : "int", - * "element-required" : false - * }, - * { - * "id" : 3, - * "name" : "data", - * "required" : false, - * "type" : "binary" - * } - * } - */ -class IcebergSchemaProcessor -{ - using Node = ActionsDAG::Node; - -public: - void addIcebergTableSchema(Poco::JSON::Object::Ptr schema_ptr); - std::shared_ptr getClickhouseTableSchemaById(Int32 id); - std::shared_ptr getSchemaTransformationDagByIds(Int32 old_id, Int32 new_id); - -private: - std::unordered_map iceberg_table_schemas_by_ids; - std::unordered_map> clickhouse_table_schemas_by_ids; - std::map, std::shared_ptr> transform_dags_by_ids; - - NamesAndTypesList getSchemaType(const Poco::JSON::Object::Ptr & schema); - DataTypePtr getComplexTypeFromObject(const Poco::JSON::Object::Ptr & type); - DataTypePtr getFieldType(const Poco::JSON::Object::Ptr & field, const String & type_key, bool required); - DataTypePtr getSimpleType(const String & type_name); - - bool allowPrimitiveTypeConversion(const String & old_type, const String & new_type); - const Node * getDefaultNodeForField(const Poco::JSON::Object::Ptr & field); - - std::shared_ptr getSchemaTransformationDag( - const Poco::JSON::Object::Ptr & old_schema, const Poco::JSON::Object::Ptr & new_schema, Int32 old_id, Int32 new_id); - - std::mutex mutex; -}; - - -/** - * Useful links: - * - https://iceberg.apache.org/spec/ - * - * Iceberg has two format versions, v1 and v2. The content of metadata files depends on the version. - * - * Unlike DeltaLake, Iceberg has several metadata layers: `table metadata`, `manifest list` and `manifest_files`. - * Metadata file - json file. - * Manifest list – an Avro file that lists manifest files; one per snapshot. - * Manifest file – an Avro file that lists data or delete files; a subset of a snapshot. - * All changes to table state create a new metadata file and replace the old metadata with an atomic swap. - * - * In order to find out which data files to read, we need to find the `manifest list` - * which corresponds to the latest snapshot. We find it by checking a list of snapshots - * in metadata's "snapshots" section. - * - * Example of metadata.json file. - * { - * "format-version" : 1, - * "table-uuid" : "ca2965ad-aae2-4813-8cf7-2c394e0c10f5", - * "location" : "/iceberg_data/db/table_name", - * "last-updated-ms" : 1680206743150, - * "last-column-id" : 2, - * "schema" : { "type" : "struct", "schema-id" : 0, "fields" : [ {}, {}, ... ] }, - * "current-schema-id" : 0, - * "schemas" : [ ], - * ... 
- * "current-snapshot-id" : 2819310504515118887, - * "refs" : { "main" : { "snapshot-id" : 2819310504515118887, "type" : "branch" } }, - * "snapshots" : [ { - * "snapshot-id" : 2819310504515118887, - * "timestamp-ms" : 1680206743150, - * "summary" : { - * "operation" : "append", "spark.app.id" : "local-1680206733239", - * "added-data-files" : "1", "added-records" : "100", - * "added-files-size" : "1070", "changed-partition-count" : "1", - * "total-records" : "100", "total-files-size" : "1070", "total-data-files" : "1", "total-delete-files" : "0", - * "total-position-deletes" : "0", "total-equality-deletes" : "0" - * }, - * "manifest-list" : "/iceberg_data/db/table_name/metadata/snap-2819310504515118887-1-c87bfec7-d36c-4075-ad04-600b6b0f2020.avro", - * "schema-id" : 0 - * } ], - * "statistics" : [ ], - * "snapshot-log" : [ ... ], - * "metadata-log" : [ ] - * } - */ -class IcebergMetadata : public IDataLakeMetadata, private WithContext -{ -public: - using ConfigurationObserverPtr = StorageObjectStorage::ConfigurationObserverPtr; - - static constexpr auto name = "Iceberg"; - - IcebergMetadata( - ObjectStoragePtr object_storage_, - ConfigurationObserverPtr configuration_, - const DB::ContextPtr & context_, - Int32 metadata_version_, - Int32 format_version_, - String manifest_list_file_, - const Poco::JSON::Object::Ptr & object); - - /// Get data files. On first request it reads manifest_list file and iterates through manifest files to find all data files. - /// All subsequent calls will return saved list of files (because it cannot be changed without changing metadata file) - Strings getDataFiles() const override; - - /// Get table schema parsed from metadata. - NamesAndTypesList getTableSchema() const override { return schema; } - - const std::unordered_map & getColumnNameToPhysicalNameMapping() const override { return column_name_to_physical_name; } - - const DataLakePartitionColumns & getPartitionColumns() const override { return partition_columns; } - - bool operator==(const IDataLakeMetadata & other) const override - { - const auto * iceberg_metadata = dynamic_cast(&other); - return iceberg_metadata && getVersion() == iceberg_metadata->getVersion(); - } - - static DataLakeMetadataPtr - create(const ObjectStoragePtr & object_storage, const ConfigurationObserverPtr & configuration, const ContextPtr & local_context); - - size_t getVersion() const { return metadata_version; } - - std::shared_ptr getInitialSchemaByPath(const String & data_path) const override - { - auto version_if_outdated = getSchemaVersionByFileIfOutdated(data_path); - return version_if_outdated.has_value() ? schema_processor.getClickhouseTableSchemaById(version_if_outdated.value()) : nullptr; - } - - std::shared_ptr getSchemaTransformer(const String & data_path) const override - { - auto version_if_outdated = getSchemaVersionByFileIfOutdated(data_path); - return version_if_outdated.has_value() - ? 
schema_processor.getSchemaTransformationDagByIds(version_if_outdated.value(), current_schema_id) - : nullptr; - } - - bool supportsExternalMetadataChange() const override { return true; } - -private: - mutable std::unordered_map schema_id_by_data_file; - - const ObjectStoragePtr object_storage; - const ConfigurationObserverPtr configuration; - Int32 metadata_version; - Int32 format_version; - String manifest_list_file; - Int32 current_schema_id; - mutable Strings data_files; - std::unordered_map column_name_to_physical_name; - DataLakePartitionColumns partition_columns; - NamesAndTypesList schema; - mutable IcebergSchemaProcessor schema_processor; - LoggerPtr log; - - mutable std::mutex get_data_files_mutex; - - std::optional getSchemaVersionByFileIfOutdated(String data_path) const - { - auto schema_id = schema_id_by_data_file.find(data_path); - if (schema_id == schema_id_by_data_file.end()) - { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot find schema version for data file: {}", data_path); - } - if (schema_id->second == current_schema_id) - return std::nullopt; - return std::optional{schema_id->second}; - } -}; - -} - -#endif From 7f2a8fca73dd64f416a51de34b537e0a0d16bd50 Mon Sep 17 00:00:00 2001 From: alesapin Date: Thu, 12 Dec 2024 15:39:09 +0100 Subject: [PATCH 38/41] Remove unused code --- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 100 ------------------ src/Storages/MergeTree/IMergeTreeDataPart.h | 20 ---- 2 files changed, 120 deletions(-) diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index f2494c64af5..43d310d1780 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -883,39 +883,6 @@ void IMergeTreeDataPart::loadColumnsChecksumsIndexes(bool require_columns_checks } } -void IMergeTreeDataPart::appendFilesOfColumnsChecksumsIndexes(Strings & files, bool include_projection) const -{ - if (isStoredOnDisk()) - { - if (!isStoredOnReadonlyDisk()) - appendFilesOfUUID(files); - - appendFilesOfColumns(files); - appendFilesOfChecksums(files); - appendFilesOfIndexGranularity(files); - appendFilesOfIndex(files); - appendFilesOfRowsCount(files); - appendFilesOfPartitionAndMinMaxIndex(files); - - if (!isStoredOnReadonlyDisk()) - appendFilesOfTTLInfos(files); - - appendFilesOfDefaultCompressionCodec(files); - appendFilesOfMetadataVersion(files); - } - - if (!parent_part && include_projection) - { - for (const auto & [projection_name, projection_part] : projection_parts) - { - Strings projection_files; - projection_part->appendFilesOfColumnsChecksumsIndexes(projection_files, true); - for (const auto & projection_file : projection_files) - files.push_back(fs::path(projection_part->name + ".proj") / projection_file); - } - } -} - MergeTreeDataPartBuilder IMergeTreeDataPart::getProjectionPartBuilder(const String & projection_name, bool is_temp_projection) { const char * projection_extension = is_temp_projection ? ".tmp_proj" : ".proj"; @@ -994,10 +961,6 @@ void IMergeTreeDataPart::loadIndexGranularity() "Method 'loadIndexGranularity' is not implemented for part with type {}", getType().toString()); } -/// Currently we don't cache mark files of part, because cache other meta files is enough to speed up loading. 
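The comment above (removed here along with its dead code) alludes to the same principle behind the snapshot-scoped cache from PATCH 32: cache the metadata that is expensive to recompute, skip what is not. As an aside, that cache follows a plain memoization-with-invalidation pattern; below is a minimal standalone sketch of the pattern only — the `SnapshotCache` name and the listing callback are illustrative, not the actual ClickHouse types:

    #include <functional>
    #include <optional>
    #include <string>
    #include <vector>

    using Strings = std::vector<std::string>;

    struct SnapshotCache
    {
        // Dropped whenever the current snapshot changes, mirroring
        // `cached_files_for_current_snapshot = std::nullopt;` in IcebergMetadata::update().
        mutable std::optional<Strings> cached_files;

        // First call after a snapshot switch pays for the manifest traversal;
        // every later call on the same snapshot returns the memoized list.
        Strings getDataFiles(const std::function<Strings()> & list_data_files) const
        {
            if (!cached_files.has_value())
                cached_files.emplace(list_data_files());
            return *cached_files;
        }

        void onSnapshotUpdate() { cached_files.reset(); }
    };

The `mutable` member is what lets a logically-const `getDataFiles() const` populate the cache, which is exactly the shape used in IcebergMetadata.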
-void IMergeTreeDataPart::appendFilesOfIndexGranularity(Strings & /* files */) const -{ -} template void IMergeTreeDataPart::optimizeIndexColumns(size_t marks_count, Columns & index_columns) const @@ -1098,22 +1061,6 @@ std::shared_ptr IMergeTreeDataPart::loadIndex() const return std::make_shared(std::make_move_iterator(loaded_index.begin()), std::make_move_iterator(loaded_index.end())); } -void IMergeTreeDataPart::appendFilesOfIndex(Strings & files) const -{ - auto metadata_snapshot = storage.getInMemoryMetadataPtr(); - if (parent_part) - metadata_snapshot = metadata_snapshot->projections.has(name) ? metadata_snapshot->projections.get(name).metadata : nullptr; - - if (!metadata_snapshot) - return; - - if (metadata_snapshot->hasPrimaryKey()) - { - String index_name = "primary" + getIndexExtensionFromFilesystem(getDataPartStorage()); - files.push_back(index_name); - } -} - NameSet IMergeTreeDataPart::getFileNamesWithoutChecksums() const { if (!isStoredOnDisk()) @@ -1287,16 +1234,6 @@ void IMergeTreeDataPart::removeMetadataVersion() getDataPartStorage().removeFileIfExists(METADATA_VERSION_FILE_NAME); } -void IMergeTreeDataPart::appendFilesOfDefaultCompressionCodec(Strings & files) -{ - files.push_back(DEFAULT_COMPRESSION_CODEC_FILE_NAME); -} - -void IMergeTreeDataPart::appendFilesOfMetadataVersion(Strings & files) -{ - files.push_back(METADATA_VERSION_FILE_NAME); -} - CompressionCodecPtr IMergeTreeDataPart::detectDefaultCompressionCodec() const { /// In memory parts doesn't have any compression @@ -1384,18 +1321,6 @@ void IMergeTreeDataPart::loadPartitionAndMinMaxIndex() getDataPartStorage().getFullPath(), calculated_partition_id, info.partition_id); } -void IMergeTreeDataPart::appendFilesOfPartitionAndMinMaxIndex(Strings & files) const -{ - if (storage.format_version < MERGE_TREE_DATA_MIN_FORMAT_VERSION_WITH_CUSTOM_PARTITIONING && !parent_part) - return; - - if (!parent_part) - MergeTreePartition::appendFiles(storage, files); - - if (!parent_part) - IMergeTreeDataPart::MinMaxIndex::appendFiles(storage, files); -} - void IMergeTreeDataPart::loadChecksums(bool require) { if (auto buf = metadata_manager->readIfExists("checksums.txt")) @@ -1427,11 +1352,6 @@ void IMergeTreeDataPart::loadChecksums(bool require) } } -void IMergeTreeDataPart::appendFilesOfChecksums(Strings & files) -{ - files.push_back("checksums.txt"); -} - void IMergeTreeDataPart::loadRowsCountFileForUnexpectedPart() { if (storage.format_version >= MERGE_TREE_DATA_MIN_FORMAT_VERSION_WITH_CUSTOM_PARTITIONING || part_type == Type::Compact || parent_part) @@ -1655,11 +1575,6 @@ UInt64 IMergeTreeDataPart::readExistingRowsCount() return existing_count; } -void IMergeTreeDataPart::appendFilesOfRowsCount(Strings & files) -{ - files.push_back("count.txt"); -} - void IMergeTreeDataPart::loadTTLInfos() { if (auto in = metadata_manager->readIfExists("ttl.txt")) @@ -1686,11 +1601,6 @@ void IMergeTreeDataPart::loadTTLInfos() } -void IMergeTreeDataPart::appendFilesOfTTLInfos(Strings & files) -{ - files.push_back("ttl.txt"); -} - void IMergeTreeDataPart::loadUUID() { if (auto in = metadata_manager->readIfExists(UUID_FILE_NAME)) @@ -1701,11 +1611,6 @@ void IMergeTreeDataPart::loadUUID() } } -void IMergeTreeDataPart::appendFilesOfUUID(Strings & files) -{ - files.push_back(UUID_FILE_NAME); -} - void IMergeTreeDataPart::loadColumns(bool require) { String path = fs::path(getDataPartStorage().getRelativePath()) / "columns.txt"; @@ -1995,11 +1900,6 @@ bool IMergeTreeDataPart::assertHasValidVersionMetadata() const } } -void 
IMergeTreeDataPart::appendFilesOfColumns(Strings & files) -{ - files.push_back("columns.txt"); - files.push_back(SERIALIZATION_FILE_NAME); -} bool IMergeTreeDataPart::shallParticipateInMerges(const StoragePolicyPtr & storage_policy) const { diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index ad20d5a5d0c..ec949699236 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -179,7 +179,6 @@ public: /// Initialize columns (from columns.txt if exists, or create from column files if not). /// Load various metadata into memory: checksums from checksums.txt, index if required, etc. void loadColumnsChecksumsIndexes(bool require_columns_checksums, bool check_consistency); - void appendFilesOfColumnsChecksumsIndexes(Strings & files, bool include_projection = false) const; void loadRowsCountFileForUnexpectedPart(); @@ -700,20 +699,13 @@ private: /// Reads part unique identifier (if exists) from uuid.txt void loadUUID(); - static void appendFilesOfUUID(Strings & files); /// Reads columns names and types from columns.txt void loadColumns(bool require); - static void appendFilesOfColumns(Strings & files); - - static void appendFilesOfChecksums(Strings & files); - /// Loads marks index granularity into memory virtual void loadIndexGranularity(); - virtual void appendFilesOfIndexGranularity(Strings & files) const; - /// Loads the index file. std::shared_ptr loadIndex() const; @@ -721,8 +713,6 @@ private: template void optimizeIndexColumns(size_t marks_count, Columns & index_columns) const; - void appendFilesOfIndex(Strings & files) const; - /// Load rows count for this part from disk (for the newer storage format version). /// For the older format version calculates rows count from the size of a column with a fixed size. void loadRowsCount(); @@ -731,21 +721,15 @@ private: /// if load_existing_rows_count_for_old_parts and exclude_deleted_rows_for_part_size_in_merge are both enabled. void loadExistingRowsCount(); - static void appendFilesOfRowsCount(Strings & files); - /// Loads ttl infos in json format from file ttl.txt. If file doesn't exists assigns ttl infos with all zeros void loadTTLInfos(); - static void appendFilesOfTTLInfos(Strings & files); - void loadPartitionAndMinMaxIndex(); void calculateColumnsSizesOnDisk(std::optional columns_sample = std::nullopt); void calculateSecondaryIndicesSizesOnDisk(); - void appendFilesOfPartitionAndMinMaxIndex(Strings & files) const; - /// Load default compression codec from file default_compression_codec.txt /// if it not exists tries to deduce codec from compressed column without /// any specifial compression. @@ -757,10 +741,6 @@ private: template void writeMetadata(const String & filename, const WriteSettings & settings, Writer && writer); - static void appendFilesOfDefaultCompressionCodec(Strings & files); - - static void appendFilesOfMetadataVersion(Strings & files); - /// Found column without specific compression and return codec /// for this column with default parameters. 
CompressionCodecPtr detectDefaultCompressionCodec() const; From 5060c79b2ca5cd356ae3646d4c3897ac85d0638e Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 12 Dec 2024 21:40:44 +0000 Subject: [PATCH 39/41] Mark server setting use_legacy_mongodb_integration as obsolete --- src/Core/ServerSettings.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Core/ServerSettings.cpp b/src/Core/ServerSettings.cpp index 2169f238c18..379310d2d51 100644 --- a/src/Core/ServerSettings.cpp +++ b/src/Core/ServerSettings.cpp @@ -201,7 +201,7 @@ namespace DB DECLARE(UInt64, parts_kill_delay_period_random_add, 10, "Add uniformly distributed value from 0 to x seconds to kill_delay_period to avoid thundering herd effect and subsequent DoS of ZooKeeper in case of very large number of tables. Only available in ClickHouse Cloud", 0) \ DECLARE(UInt64, parts_killer_pool_size, 128, "Threads for cleanup of shared merge tree outdated threads. Only available in ClickHouse Cloud", 0) \ DECLARE(UInt64, keeper_multiread_batch_size, 10'000, "Maximum size of batch for MultiRead request to [Zoo]Keeper that support batching. If set to 0, batching is disabled. Available only in ClickHouse Cloud.", 0) \ - DECLARE(Bool, use_legacy_mongodb_integration, true, "Obsolete, has no effect", 0) \ + DECLARE(Bool, use_legacy_mongodb_integration, true, "Obsolete, does nothing.", SettingsTierType::OBSOLETE) \ DECLARE(Bool, send_settings_to_client, true, "Send user settings from server configuration to clients (in the server Hello message).", 0) \ \ DECLARE(UInt64, prefetch_threadpool_pool_size, 100, "Size of background pool for prefetches for remote object storages", 0) \ From 14389a7c54546466814ffd9712c8359bb2c7aa1f Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 12 Dec 2024 22:53:40 +0000 Subject: [PATCH 40/41] Fix test --- tests/queries/0_stateless/02888_obsolete_settings.reference | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/queries/0_stateless/02888_obsolete_settings.reference b/tests/queries/0_stateless/02888_obsolete_settings.reference index e2196860f4e..b8eec4794f7 100644 --- a/tests/queries/0_stateless/02888_obsolete_settings.reference +++ b/tests/queries/0_stateless/02888_obsolete_settings.reference @@ -1,4 +1,5 @@ -- Obsolete server settings +use_legacy_mongodb_integration -- Obsolete general settings 1 -- Obsolete merge tree settings From 5f00c1e6146d5571aa8c87b0f61c502c2adaf76d Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 13 Dec 2024 10:58:00 +0000 Subject: [PATCH 41/41] Fix clang-tidy build --- src/Databases/Iceberg/RestCatalog.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/Databases/Iceberg/RestCatalog.cpp b/src/Databases/Iceberg/RestCatalog.cpp index ab51bb8a495..078d9960ce4 100644 --- a/src/Databases/Iceberg/RestCatalog.cpp +++ b/src/Databases/Iceberg/RestCatalog.cpp @@ -53,7 +53,8 @@ std::pair<std::string, std::string> parseCatalogCredential(const std::string & c /// Parse a string of format "<client_id>:<client_secret>" /// into separate strings client_id and client_secret.
- std::string client_id, client_secret; + std::string client_id; + std::string client_secret; if (!catalog_credential.empty()) { auto pos = catalog_credential.find(':'); @@ -623,7 +624,9 @@ bool RestCatalog::getTableMetadataImpl( static constexpr auto secret_access_key_str = "s3.secret-access-key"; static constexpr auto session_token_str = "s3.session-token"; - std::string access_key_id, secret_access_key, session_token; + std::string access_key_id; + std::string secret_access_key; + std::string session_token; if (config_object->has(access_key_id_str)) access_key_id = config_object->get(access_key_id_str).extract<String>(); if (config_object->has(secret_access_key_str))
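For reference, PATCH 41 only isolates the comma-joined declarations — the sort of pattern clang-tidy's readability-isolate-declaration check flags — and leaves the parsing logic untouched. A minimal standalone sketch of what parseCatalogCredential does, where the helper name and the plain std::invalid_argument error are illustrative (the real code throws a ClickHouse DB::Exception):

    #include <stdexcept>
    #include <string>
    #include <utility>

    // Splits "<client_id>:<client_secret>" on the first ':';
    // an empty credential yields two empty strings.
    std::pair<std::string, std::string> splitCatalogCredential(const std::string & catalog_credential)
    {
        std::string client_id;
        std::string client_secret;
        if (!catalog_credential.empty())
        {
            auto pos = catalog_credential.find(':');
            if (pos == std::string::npos)
                throw std::invalid_argument("Catalog credential must look like <client_id>:<client_secret>");
            client_id = catalog_credential.substr(0, pos);
            client_secret = catalog_credential.substr(pos + 1);
        }
        return {client_id, client_secret};
    }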