Merge remote-tracking branch 'upstream/master' into named-collections-sql-commands

2024-09-20 00:30:49 +00:00 · 2022-11-23 12:00:55 +01:00 · 2022-11-23 12:00:55 +01:00 · 6044a9257c
commit 6044a9257c
parent 1073626f8e c81257c7e2
173 changed files with 4598 additions and 643 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -442,8 +442,9 @@ elseif (OS_DARWIN)
    include(cmake/darwin/default_libs.cmake)
 elseif (OS_FREEBSD)
    include(cmake/freebsd/default_libs.cmake)
+else()
+    link_libraries(global-group)
 endif ()
-link_libraries(global-group)

 if (NOT (OS_LINUX OR OS_DARWIN))
    # Using system libs can cause a lot of warnings in includes (on macro expansion).
@ -592,7 +593,7 @@ add_subdirectory (programs)
 add_subdirectory (tests)
 add_subdirectory (utils)

-include (cmake/sanitize_target_link_libraries.cmake)
+include (cmake/sanitize_targets.cmake)

 # Build native targets if necessary
 get_property(NATIVE_BUILD_TARGETS GLOBAL PROPERTY NATIVE_BUILD_TARGETS)
--- a/base/glibc-compatibility/glibc-compatibility.c
+++ b/base/glibc-compatibility/glibc-compatibility.c
@ -220,13 +220,13 @@ struct statx {
 	uint32_t stx_dev_minor;
 	uint64_t spare[14];
 };
-#endif

 int statx(int fd, const char *restrict path, int flag,
                 unsigned int mask, struct statx *restrict statxbuf)
 {
 	return syscall(SYS_statx, fd, path, flag, mask, statxbuf);
 }
+#endif


 #include <syscall.h>
--- a/cmake/darwin/default_libs.cmake
+++ b/cmake/darwin/default_libs.cmake
@ -23,6 +23,7 @@ set(THREADS_PREFER_PTHREAD_FLAG ON)
 find_package(Threads REQUIRED)

 include (cmake/cxx.cmake)
+link_libraries(global-group)

 target_link_libraries(global-group INTERFACE
    $<TARGET_PROPERTY:global-libs,INTERFACE_LINK_LIBRARIES>
--- a/cmake/freebsd/default_libs.cmake
+++ b/cmake/freebsd/default_libs.cmake
@ -24,6 +24,7 @@ find_package(Threads REQUIRED)

 include (cmake/unwind.cmake)
 include (cmake/cxx.cmake)
+link_libraries(global-group)

 target_link_libraries(global-group INTERFACE
    $<TARGET_PROPERTY:global-libs,INTERFACE_LINK_LIBRARIES>
--- a/cmake/linux/default_libs.cmake
+++ b/cmake/linux/default_libs.cmake
@ -34,6 +34,13 @@ set(CMAKE_C_STANDARD_LIBRARIES ${DEFAULT_LIBS})
 set(THREADS_PREFER_PTHREAD_FLAG ON)
 find_package(Threads REQUIRED)

+include (cmake/unwind.cmake)
+include (cmake/cxx.cmake)
+
+# Delay the call to link the global interface after the libc++ libraries are included to avoid circular dependencies
+# which are ok with static libraries but not with dynamic ones
+link_libraries(global-group)
+
 if (NOT OS_ANDROID)
    if (NOT USE_MUSL)
        # Our compatibility layer doesn't build under Android, many errors in musl.
@ -42,9 +49,6 @@ if (NOT OS_ANDROID)
    add_subdirectory(base/harmful)
 endif ()

-include (cmake/unwind.cmake)
-include (cmake/cxx.cmake)
-
 target_link_libraries(global-group INTERFACE
    -Wl,--start-group
    $<TARGET_PROPERTY:global-libs,INTERFACE_LINK_LIBRARIES>
--- a/cmake/sanitize_target_link_libraries.cmake
+++ b/cmake/sanitize_target_link_libraries.cmake
@ -1,3 +1,13 @@
+# Appends the BUILDSYSTEM_TARGETS of `dir` and of every directory below it (walked via SUBDIRECTORIES) to the list variable named by `targets`; being a macro, it appends in the caller's scope. Source: https://stackoverflow.com/a/62311397/328260
+macro (get_all_targets_recursive targets dir)
+    get_property (subdirectories DIRECTORY ${dir} PROPERTY SUBDIRECTORIES)
+    foreach (subdir ${subdirectories})
+        get_all_targets_recursive (${targets} ${subdir})
+    endforeach ()
+    get_property (current_targets DIRECTORY ${dir} PROPERTY BUILDSYSTEM_TARGETS)
+    list (APPEND ${targets} ${current_targets})
+endmacro ()
+
 # When you will try to link target with the directory (that exists), cmake will
 # skip this without an error, only the following warning will be reported:
 #
@ -18,23 +28,12 @@
 #   -- but cannot be used with link_libraries()
 # - use BUILDSYSTEM_TARGETS property to get list of all targets and sanitize
 #   -- this will work.
-
-# https://stackoverflow.com/a/62311397/328260
 function (get_all_targets var)
    set (targets)
    get_all_targets_recursive (targets ${CMAKE_CURRENT_SOURCE_DIR})
    set (${var} ${targets} PARENT_SCOPE)
 endfunction()
-macro (get_all_targets_recursive targets dir)
-    get_property (subdirectories DIRECTORY ${dir} PROPERTY SUBDIRECTORIES)
-    foreach (subdir ${subdirectories})
-        get_all_targets_recursive (${targets} ${subdir})
-    endforeach ()
-    get_property (current_targets DIRECTORY ${dir} PROPERTY BUILDSYSTEM_TARGETS)
-    list (APPEND ${targets} ${current_targets})
-endmacro ()
-
-macro (sanitize_link_libraries target)
+function (sanitize_link_libraries target)
    get_target_property(target_type ${target} TYPE)
    if (${target_type} STREQUAL "INTERFACE_LIBRARY")
        get_property(linked_libraries TARGET ${target} PROPERTY INTERFACE_LINK_LIBRARIES)
@ -48,9 +47,35 @@ macro (sanitize_link_libraries target)
            message(FATAL_ERROR "${target} requested to link with directory: ${linked_library}")
        endif()
    endforeach()
-endmacro()
-
+endfunction()
 get_all_targets (all_targets)
 foreach (target ${all_targets})
    sanitize_link_libraries(${target})
 endforeach()
+
+#
+# Do not allow contrib targets to define -W* flags publicly (INTERFACE/PUBLIC).
+#
+# Stores in the caller's variable `var` every target that
+# get_all_targets_recursive collects under ${CMAKE_CURRENT_SOURCE_DIR}/contrib.
+function (get_contrib_targets var)
+    set (contrib_dir ${CMAKE_CURRENT_SOURCE_DIR}/contrib)
+    # Start from an unset list so that only targets found below are returned.
+    unset (found_targets)
+    get_all_targets_recursive (found_targets ${contrib_dir})
+    set (${var} ${found_targets} PARENT_SCOPE)
+endfunction()
+# Fails the configuration when `target` publishes compiler flags to its consumers:
+#  - any non-empty INTERFACE_COMPILE_OPTIONS is rejected;
+#  - any INTERFACE_COMPILE_DEFINITIONS entry matching "-W" is rejected, unless
+#    that entry matches "-Wl," (linker option).
+# Entries are inspected one by one so that a single "-Wl," entry cannot mask a
+# "-W*" entry elsewhere in the list.
+function (sanitize_interface_flags target)
+    get_property(compile_definitions TARGET ${target} PROPERTY INTERFACE_COMPILE_DEFINITIONS)
+    get_property(compile_options TARGET ${target} PROPERTY INTERFACE_COMPILE_OPTIONS)
+    if (NOT "${compile_options}" STREQUAL "")
+        message(FATAL_ERROR "${target} set INTERFACE_COMPILE_OPTIONS to ${compile_options}. This is forbidden.")
+    endif()
+    foreach (compile_definition IN LISTS compile_definitions)
+        if ("${compile_definition}" MATCHES "-Wl,")
+            # linker option - OK
+        elseif ("${compile_definition}" MATCHES "-W")
+            message(FATAL_ERROR "${target} contains ${compile_definitions} flags in INTERFACE_COMPILE_DEFINITIONS. This is forbidden.")
+        endif()
+    endforeach()
+endfunction()
+get_contrib_targets (contrib_targets)
+foreach (contrib_target ${contrib_targets})
+    sanitize_interface_flags(${contrib_target})
+endforeach()
+
--- a/contrib/libcxx-cmake/CMakeLists.txt
+++ b/contrib/libcxx-cmake/CMakeLists.txt
@ -57,7 +57,7 @@ add_library(cxx ${SRCS})
 set_target_properties(cxx PROPERTIES FOLDER "contrib/libcxx-cmake")

 target_include_directories(cxx SYSTEM BEFORE PRIVATE $<BUILD_INTERFACE:${LIBCXX_SOURCE_DIR}/src>)
-target_include_directories(cxx SYSTEM BEFORE PUBLIC  $<BUILD_INTERFACE:${LIBCXX_SOURCE_DIR}/include>)
+target_include_directories(cxx SYSTEM BEFORE PUBLIC  $<$<COMPILE_LANGUAGE:CXX>:$<BUILD_INTERFACE:${LIBCXX_SOURCE_DIR}/include>>)
 target_compile_definitions(cxx PRIVATE -D_LIBCPP_BUILDING_LIBRARY -DLIBCXX_BUILDING_LIBCXXABI)

 # Enable capturing stack traces for all exceptions.
--- a/docker/test/stress/run.sh
+++ b/docker/test/stress/run.sh
@ -451,6 +451,7 @@ else
    # FIXME https://github.com/ClickHouse/ClickHouse/issues/39197 ("Missing columns: 'v3' while processing query: 'v3, k, v1, v2, p'")
    # NOTE  Incompatibility was introduced in https://github.com/ClickHouse/ClickHouse/pull/39263, it's expected
    #       ("This engine is deprecated and is not supported in transactions", "[Queue = DB::MergeMutateRuntimeQueue]: Code: 235. DB::Exception: Part")
+    # FIXME https://github.com/ClickHouse/ClickHouse/issues/39174 - bad mutation does not indicate backward incompatibility
    echo "Check for Error messages in server log:"
    zgrep -Fav -e "Code: 236. DB::Exception: Cancelled merging parts" \
               -e "Code: 236. DB::Exception: Cancelled mutating parts" \
@ -485,6 +486,7 @@ else
               -e "(ReplicatedMergeTreeAttachThread): Initialization failed. Error" \
               -e "Code: 269. DB::Exception: Destination table is myself" \
               -e "Coordination::Exception: Connection loss" \
+               -e "MutateFromLogEntryTask" \
        /var/log/clickhouse-server/clickhouse-server.backward.clean.log | zgrep -Fa "<Error>" > /test_output/bc_check_error_messages.txt \
        && echo -e 'Backward compatibility check: Error message in clickhouse-server.log (see bc_check_error_messages.txt)\tFAIL' >> /test_output/test_results.tsv \
        || echo -e 'Backward compatibility check: No Error messages in clickhouse-server.log\tOK' >> /test_output/test_results.tsv
--- a/docs/en/interfaces/formats.md
+++ b/docs/en/interfaces/formats.md
@ -13,7 +13,7 @@ The supported formats are:
 | Format                                                                                    | Input | Output |
 |-------------------------------------------------------------------------------------------|------|--------|
 | [TabSeparated](#tabseparated)                                                             | ✔    | ✔      |
-| [TabSeparatedRaw](#tabseparatedraw)                                                       | ✔    | ✔      |
+| [TabSeparatedRaw](#tabseparatedraw)                                 | ✔    | ✔      |
 | [TabSeparatedWithNames](#tabseparatedwithnames)                                           | ✔    | ✔      |
 | [TabSeparatedWithNamesAndTypes](#tabseparatedwithnamesandtypes)                           | ✔    | ✔      |
 | [TabSeparatedRawWithNames](#tabseparatedrawwithnames)                                     | ✔    | ✔      |
@ -48,6 +48,7 @@ The supported formats are:
 | [JSONCompactStringsEachRowWithNames](#jsoncompactstringseachrowwithnames)                 | ✔    | ✔      |
 | [JSONCompactStringsEachRowWithNamesAndTypes](#jsoncompactstringseachrowwithnamesandtypes) | ✔    | ✔      |
 | [JSONObjectEachRow](#jsonobjecteachrow)                                                   | ✔    | ✔      |
+| [BSONEachRow](#bsoneachrow)                                                               | ✔    | ✔      |
 | [TSKV](#tskv)                                                                             | ✔    | ✔      |
 | [Pretty](#pretty)                                                                         | ✗    | ✔      |
 | [PrettyNoEscapes](#prettynoescapes)                                                       | ✗    | ✔      |
@ -1210,6 +1211,69 @@ SELECT * FROM json_each_row_nested
 - [output_format_json_array_of_rows](../operations/settings/settings.md#output_format_json_array_of_rows) - output a JSON array of all rows in JSONEachRow(Compact) format. Default value - `false`.
 - [output_format_json_validate_utf8](../operations/settings/settings.md#output_format_json_validate_utf8) - enables validation of UTF-8 sequences in JSON output formats (note that it doesn't impact formats JSON/JSONCompact/JSONColumnsWithMetadata, they always validate utf8). Default value - `false`.

+## BSONEachRow {#bsoneachrow}
+
+In this format, ClickHouse formats/parses data as a sequence of BSON documents without any separator between them.
+Each row is formatted as a single document and each column is formatted as a single BSON document field with column name as a key.
+
+For output it uses the following correspondence between ClickHouse types and BSON types:
+
+| ClickHouse type                                                                                           | BSON Type                                                                                                 |
+|-----------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------|
+| [Bool](../sql-reference/data-types/boolean.md)                                                            | `\x08` boolean                                                                                            |
+| [Int8/UInt8](../sql-reference/data-types/int-uint.md)                                                     | `\x10` int32                                                                                              |
+| [Int16/UInt16](../sql-reference/data-types/int-uint.md)                                                   | `\x10` int32                                                                                              |
+| [Int32](../sql-reference/data-types/int-uint.md)                                                          | `\x10` int32                                                                                              |
+| [UInt32](../sql-reference/data-types/int-uint.md)                                                         | `\x12` int64                                                                                              |
+| [Int64/UInt64](../sql-reference/data-types/int-uint.md)                                                   | `\x12` int64                                                                                              |
+| [Float32/Float64](../sql-reference/data-types/float.md)                                                   | `\x01` double                                                                                             |
+| [Date](../sql-reference/data-types/date.md)/[Date32](../sql-reference/data-types/date32.md)               | `\x10` int32                                                                                              |
+| [DateTime](../sql-reference/data-types/datetime.md)                                                       | `\x12` int64                                                                                                |
+| [DateTime64](../sql-reference/data-types/datetime64.md)                                                   | `\x09` datetime                                                                                             |
+| [Decimal32](../sql-reference/data-types/decimal.md)                                                       | `\x10` int32                                                                                                |
+| [Decimal64](../sql-reference/data-types/decimal.md)                                                       | `\x12` int64                                                                                                |
+| [Decimal128](../sql-reference/data-types/decimal.md)                                                      | `\x05` binary, `\x00` binary subtype, size = 16                                                               |
+| [Decimal256](../sql-reference/data-types/decimal.md)                                                      | `\x05` binary, `\x00` binary subtype, size = 32                                                               |
+| [Int128/UInt128](../sql-reference/data-types/int-uint.md)                                                 | `\x05` binary, `\x00` binary subtype, size = 16                                                               |
+| [Int256/UInt256](../sql-reference/data-types/int-uint.md)                                                 | `\x05` binary, `\x00` binary subtype, size = 32                                                               |
+| [String](../sql-reference/data-types/string.md)/[FixedString](../sql-reference/data-types/fixedstring.md) | `\x05` binary, `\x00` binary subtype or `\x02` string if setting `output_format_bson_string_as_string` is enabled |
+| [UUID](../sql-reference/data-types/uuid.md)                                                               | `\x05` binary, `\x04` uuid subtype, size = 16                                                                 |
+| [Array](../sql-reference/data-types/array.md)                                                             | `\x04` array                                                                                                |
+| [Tuple](../sql-reference/data-types/tuple.md)                                                             | `\x04` array                                                                                                |
+| [Named Tuple](../sql-reference/data-types/tuple.md)                                                       | `\x03` document                                                                                             |
+| [Map](../sql-reference/data-types/map.md) (with String keys)                                              | `\x03` document                                                                                             |
+
+For input it uses the following correspondence between BSON types and ClickHouse types:
+
+| BSON Type                                | ClickHouse Type                                                                                                                                              |
+|------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| `\x01` double                            | [Float32/Float64](../sql-reference/data-types/float.md)                                                                                                      |
+| `\x02` string                            | [String](../sql-reference/data-types/string.md)/[FixedString](../sql-reference/data-types/fixedstring.md)                                                    |
+| `\x03` document                          | [Map](../sql-reference/data-types/map.md)/[Named Tuple](../sql-reference/data-types/tuple.md)                                                                |
+| `\x04` array                             | [Array](../sql-reference/data-types/array.md)/[Tuple](../sql-reference/data-types/tuple.md)                                                                  |
+| `\x05` binary, `\x00` binary subtype     | [String](../sql-reference/data-types/string.md)/[FixedString](../sql-reference/data-types/fixedstring.md)                                                    |
+| `\x05` binary, `\x02` old binary subtype | [String](../sql-reference/data-types/string.md)/[FixedString](../sql-reference/data-types/fixedstring.md)                                                    |
+| `\x05` binary, `\x03` old uuid subtype   | [UUID](../sql-reference/data-types/uuid.md)                                                                                                                  |
+| `\x05` binary, `\x04` uuid subtype       | [UUID](../sql-reference/data-types/uuid.md)                                                                                                                  |
+| `\x07` ObjectId                          | [String](../sql-reference/data-types/string.md)/[FixedString](../sql-reference/data-types/fixedstring.md)                                                    |
+| `\x08` boolean                           | [Bool](../sql-reference/data-types/boolean.md)                                                                                                               |
+| `\x09` datetime                          | [DateTime64](../sql-reference/data-types/datetime64.md)                                                                                                      |
+| `\x0A` null value                        | [NULL](../sql-reference/data-types/nullable.md)                                                                                                              |
+| `\x0D` JavaScript code                   | [String](../sql-reference/data-types/string.md)/[FixedString](../sql-reference/data-types/fixedstring.md)                                                    |
+| `\x0E` symbol                            | [String](../sql-reference/data-types/string.md)/[FixedString](../sql-reference/data-types/fixedstring.md)                                                    |
+| `\x10` int32                             | [Int32/UInt32](../sql-reference/data-types/int-uint.md)/[Decimal32](../sql-reference/data-types/decimal.md)                                                         |
+| `\x12` int64                             | [Int64/UInt64](../sql-reference/data-types/int-uint.md)/[Decimal64](../sql-reference/data-types/decimal.md)/[DateTime64](../sql-reference/data-types/datetime64.md) |
+
+Other BSON types are not supported. Also, it performs conversion between different integer types (for example, you can insert BSON int32 value into ClickHouse UInt8). 
+Big integers and decimals (Int128/UInt128/Int256/UInt256/Decimal128/Decimal256) can be parsed from BSON Binary value with `\x00` binary subtype. In this case this format will validate that the size of binary data equals the size of expected value.
+
+Note: this format doesn't work properly on Big-Endian platforms.
+
+### BSON format settings {#bson-format-settings}
+
+- [output_format_bson_string_as_string](../operations/settings/settings.md#output_format_bson_string_as_string) - use BSON String type instead of Binary for String columns. Default value - `false`.
+- [input_format_bson_skip_fields_with_unsupported_types_in_schema_inference](../operations/settings/settings.md#input_format_bson_skip_fields_with_unsupported_types_in_schema_inference) - allow skipping columns with unsupported types during schema inference for format BSONEachRow. Default value - `false`.
+
 ## Native {#native}

 The most efficient format. Data is written and read by blocks in binary format. For each block, the number of rows, number of columns, column names and types, and parts of columns in this block are recorded one after another. In other words, this format is “columnar” – it does not convert columns to rows. This is the format used in the native interface for interaction between servers, for using the command-line client, and for C++ clients.
--- a/docs/en/operations/settings/settings.md
+++ b/docs/en/operations/settings/settings.md
@ -4784,7 +4784,7 @@ Possible values:

 Default value: 1.

-## SQLInsert format settings {$sqlinsert-format-settings}
+## SQLInsert format settings {#sqlinsert-format-settings}

 ### output_format_sql_insert_max_batch_size {#output_format_sql_insert_max_batch_size}

@ -4815,3 +4815,17 @@ Default value: `false`.
 Quote column names with "`" characters

 Default value: `true`.
+
+## BSONEachRow format settings {#bson-each-row-format-settings}
+
+### output_format_bson_string_as_string {#output_format_bson_string_as_string}
+
+Use BSON String type instead of Binary for String columns.
+
+Disabled by default.
+
+### input_format_bson_skip_fields_with_unsupported_types_in_schema_inference {#input_format_bson_skip_fields_with_unsupported_types_in_schema_inference}
+
+Allow skipping columns with unsupported types during schema inference for format BSONEachRow.
+
+Disabled by default.
--- a/docs/en/sql-reference/statements/explain.md
+++ b/docs/en/sql-reference/statements/explain.md
@ -47,6 +47,7 @@ Union

 -  `AST` — Abstract syntax tree.
 -  `SYNTAX` — Query text after AST-level optimizations.
+-  `QUERY TREE` — Query tree after Query Tree level optimizations.
 -  `PLAN` — Query execution plan.
 -  `PIPELINE` — Query execution pipeline.

@ -110,6 +111,32 @@ FROM
 CROSS JOIN system.numbers AS c
 ```

+### EXPLAIN QUERY TREE
+
+Settings:
+
+-   `run_passes` — Run all query tree passes before dumping the query tree. Default: `1`.
+-   `dump_passes` — Dump information about used passes before dumping the query tree. Default: `0`.
+-   `passes` — Specifies how many passes to run. If set to `-1`, runs all the passes. Default: `-1`.
+
+Example:
+```sql
+EXPLAIN QUERY TREE SELECT id, value FROM test_table;
+```
+
+```
+QUERY id: 0
+  PROJECTION COLUMNS
+    id UInt64
+    value String
+  PROJECTION
+    LIST id: 1, nodes: 2
+      COLUMN id: 2, column_name: id, result_type: UInt64, source_id: 3
+      COLUMN id: 4, column_name: value, result_type: String, source_id: 3
+  JOIN TREE
+    TABLE id: 3, table_name: default.test_table
+```
+
 ### EXPLAIN PLAN

 Dump query plan steps.
--- a/docs/en/sql-reference/statements/select/group-by.md
+++ b/docs/en/sql-reference/statements/select/group-by.md
@ -243,6 +243,54 @@ If `max_rows_to_group_by` and `group_by_overflow_mode = 'any'` are not used, all

 You can use `WITH TOTALS` in subqueries, including subqueries in the [JOIN](../../../sql-reference/statements/select/join.md) clause (in this case, the respective total values are combined).

+## GROUP BY ALL
+
+`GROUP BY ALL` is equivalent to listing all the SELECT-ed expressions that are not aggregate functions.
+
+For example:
+
+``` sql
+SELECT
+    a * 2,
+    b,
+    count(c),
+FROM t
+GROUP BY ALL
+```
+
+is the same as
+
+``` sql
+SELECT
+    a * 2,
+    b,
+    count(c),
+FROM t
+GROUP BY a * 2, b
+```
+
+As a special case, if a function has both aggregate functions and other fields as its arguments, the `GROUP BY` keys will contain the largest non-aggregate expressions that can be extracted from it.
+
+For example:
+
+``` sql
+SELECT
+    substring(a, 4, 2),
+    substring(substring(a, 1, 2), 1, count(b))
+FROM t
+GROUP BY ALL
+```
+
+is the same as
+
+``` sql
+SELECT
+    substring(a, 4, 2),
+    substring(substring(a, 1, 2), 1, count(b))
+FROM t
+GROUP BY substring(a, 4, 2), substring(a, 1, 2)
+```
+
 ## Examples

 Example:
--- a/docs/zh/sql-reference/statements/select/group-by.md
+++ b/docs/zh/sql-reference/statements/select/group-by.md
@ -77,6 +77,54 @@ sidebar_label: GROUP BY

 您可以使用 `WITH TOTALS` 在子查询中，包括在子查询 [JOIN](../../../sql-reference/statements/select/join.md) 子句（在这种情况下，将各自的总值合并）。

+## GROUP BY ALL {#group-by-all}
+
+`GROUP BY ALL` 相当于对所有被查询的并且不被聚合函数使用的字段进行`GROUP BY`。
+
+例如
+
+``` sql
+SELECT
+    a * 2,
+    b,
+    count(c),
+FROM t
+GROUP BY ALL
+```
+
+效果等同于
+
+``` sql
+SELECT
+    a * 2,
+    b,
+    count(c),
+FROM t
+GROUP BY a * 2, b
+```
+
+对于一种特殊情况，如果一个 function 的参数中同时有聚合函数和其他字段，会对参数中能提取的最大非聚合字段进行`GROUP BY`。
+
+例如:
+
+``` sql
+SELECT
+    substring(a, 4, 2),
+    substring(substring(a, 1, 2), 1, count(b))
+FROM t
+GROUP BY ALL
+```
+
+效果等同于
+
+``` sql
+SELECT
+    substring(a, 4, 2),
+    substring(substring(a, 1, 2), 1, count(b))
+FROM t
+GROUP BY substring(a, 4, 2), substring(a, 1, 2)
+```
+
 ## 例子 {#examples}

 示例:
--- a/packages/clickhouse-server.service
+++ b/packages/clickhouse-server.service
@ -9,7 +9,10 @@ After=time-sync.target network-online.target
 Wants=time-sync.target

 [Service]
-Type=simple
+Type=notify
+
+# Switching off watchdog is very important for sd_notify to work correctly.
+Environment=CLICKHOUSE_WATCHDOG_ENABLE=0
 User=clickhouse
 Group=clickhouse
 Restart=always
--- a/programs/keeper/Keeper.cpp
+++ b/programs/keeper/Keeper.cpp
@ -262,6 +262,7 @@ void Keeper::defineOptions(Poco::Util::OptionSet & options)
 }

 int Keeper::main(const std::vector<std::string> & /*args*/)
+try
 {
    Poco::Logger * log = &logger();

@ -473,6 +474,12 @@ int Keeper::main(const std::vector<std::string> & /*args*/)

    return Application::EXIT_OK;
 }
+catch (...)
+{
+    /// Poco does not provide stacktrace.
+    tryLogCurrentException("Application");
+    throw;
+}


 void Keeper::logRevision() const
--- a/programs/server/Server.cpp
+++ b/programs/server/Server.cpp
@ -99,6 +99,10 @@
 #include "config_version.h"

 #if defined(OS_LINUX)
+#    include <cstddef>
+#    include <cstdlib>
+#    include <sys/socket.h>
+#    include <sys/un.h>
 #    include <sys/mman.h>
 #    include <sys/ptrace.h>
 #    include <Common/hasLinuxCapability.h>
@ -273,6 +277,7 @@ namespace ErrorCodes
    extern const int MISMATCHING_USERS_FOR_PROCESS_AND_DATA;
    extern const int NETWORK_ERROR;
    extern const int CORRUPTED_DATA;
+    extern const int SYSTEM_ERROR;
 }


@ -646,7 +651,53 @@ static void sanityChecks(Server & server)
    }
 }

+#if defined(OS_LINUX)
+/// Sends a notification datagram to systemd, analogous to sd_notify from libsystemd.
+///
+/// `command` is sent as-is (e.g. "READY=1\n") to the UNIX datagram socket named by the
+/// NOTIFY_SOCKET environment variable. If the variable is not set, the function does nothing.
+/// Throws an exception with code SYSTEM_ERROR if the socket cannot be created, if NOTIFY_SOCKET
+/// holds an unusable address, or if the datagram cannot be sent.
+static void systemdNotify(const std::string_view & command)
+{
+    const char * path = getenv("NOTIFY_SOCKET");  // NOLINT(concurrency-mt-unsafe)
+
+    if (path == nullptr)
+        return; /// not using systemd
+
+    int s = socket(AF_UNIX, SOCK_DGRAM | SOCK_CLOEXEC, 0);
+
+    if (s == -1)
+        throwFromErrno("Can't create UNIX socket for systemd notify.", ErrorCodes::SYSTEM_ERROR);
+
+    SCOPE_EXIT({ close(s); });
+
+    const size_t len = strlen(path);
+
+    /// Value-initialize so that no indeterminate bytes are handed to the kernel.
+    struct sockaddr_un addr = {};
+
+    addr.sun_family = AF_UNIX;
+
+    /// The path must have at least a prefix character plus a name, and must fit together with its terminating zero.
+    if (len < 2 || len > sizeof(addr.sun_path) - 1)
+        throw Exception(ErrorCodes::SYSTEM_ERROR, "NOTIFY_SOCKET env var value \"{}\" is wrong.", path);
+
+    memcpy(addr.sun_path, path, len + 1); /// write last zero as well.
+
+    size_t addrlen = offsetof(struct sockaddr_un, sun_path) + len;
+
+    /// '@' means this is a Linux abstract socket; per documentation sun_path[0] must be set to '\0' for it.
+    if (path[0] == '@')
+        addr.sun_path[0] = 0;
+    else if (path[0] == '/')
+        addrlen += 1; /// non-abstract-addresses should be zero terminated.
+    else
+        throw Exception(ErrorCodes::SYSTEM_ERROR, "Wrong UNIX path \"{}\" in NOTIFY_SOCKET env var", path);
+
+    const struct sockaddr * sock_addr = reinterpret_cast<const struct sockaddr *>(&addr);
+
+    const ssize_t sent = sendto(s, command.data(), command.size(), 0, sock_addr, static_cast<socklen_t>(addrlen));
+
+    /// When the call itself failed, report errno (same as for the socket() failure above).
+    if (sent == -1)
+        throwFromErrno("Failed to notify systemd.", ErrorCodes::SYSTEM_ERROR);
+
+    /// A short send does not set errno, so report it without one.
+    if (sent != static_cast<ssize_t>(command.size()))
+        throw Exception("Failed to notify systemd.", ErrorCodes::SYSTEM_ERROR);
+}
+#endif
+
 int Server::main(const std::vector<std::string> & /*args*/)
+try
 {
    Poco::Logger * log = &logger();

@ -1150,6 +1201,9 @@ int Server::main(const std::vector<std::string> & /*args*/)
            total_memory_tracker.setDescription("(total)");
            total_memory_tracker.setMetric(CurrentMetrics::MemoryTracking);

+            bool allow_use_jemalloc_memory = config->getBool("allow_use_jemalloc_memory", true);
+            total_memory_tracker.setAllowUseJemallocMemory(allow_use_jemalloc_memory);
+
            auto * global_overcommit_tracker = global_context->getGlobalOvercommitTracker();
            total_memory_tracker.setOvercommitTracker(global_overcommit_tracker);

@ -1779,6 +1833,10 @@ int Server::main(const std::vector<std::string> & /*args*/)
            tryLogCurrentException(log, "Caught exception while starting cluster discovery");
        }

+#if defined(OS_LINUX)
+        systemdNotify("READY=1\n");
+#endif
+
        SCOPE_EXIT_SAFE({
            LOG_DEBUG(log, "Received termination signal.");

@ -1848,6 +1906,12 @@ int Server::main(const std::vector<std::string> & /*args*/)

    return Application::EXIT_OK;
 }
+catch (...)
+{
+    /// Poco does not provide stacktrace.
+    tryLogCurrentException("Application");
+    throw;
+}

 std::unique_ptr<TCPProtocolStackFactory> Server::buildProtocolStackFromConfig(
    const Poco::Util::AbstractConfiguration & config,
--- a/src/Access/Common/AccessType.h
+++ b/src/Access/Common/AccessType.h
@ -133,8 +133,8 @@ enum class AccessType
    M(SHOW_ROW_POLICIES, "SHOW POLICIES, SHOW CREATE ROW POLICY, SHOW CREATE POLICY", TABLE, SHOW_ACCESS) \
    M(SHOW_QUOTAS, "SHOW CREATE QUOTA", GLOBAL, SHOW_ACCESS) \
    M(SHOW_SETTINGS_PROFILES, "SHOW PROFILES, SHOW CREATE SETTINGS PROFILE, SHOW CREATE PROFILE", GLOBAL, SHOW_ACCESS) \
-    M(SHOW_NAMED_COLLECTIONS, "SHOW NAMED COLLECTIONS", GLOBAL, SHOW_ACCESS) \
    M(SHOW_ACCESS, "", GROUP, ACCESS_MANAGEMENT) \
+    M(SHOW_NAMED_COLLECTIONS, "SHOW NAMED COLLECTIONS", GROUP, ACCESS_MANAGEMENT) \
    M(ACCESS_MANAGEMENT, "", GROUP, ALL) \
    \
    M(SYSTEM_SHUTDOWN, "SYSTEM KILL, SHUTDOWN", GLOBAL, SYSTEM) \
--- a/src/Access/ExternalAuthenticators.cpp
+++ b/src/Access/ExternalAuthenticators.cpp
@ -2,6 +2,8 @@
 #include <Access/LDAPClient.h>
 #include <Common/Exception.h>
 #include <Common/quoteString.h>
+#include <Common/SipHash.h>
+
 #include <Poco/Util/AbstractConfiguration.h>
 #include <boost/algorithm/string/case_conv.hpp>

@ -73,6 +75,7 @@ void parseLDAPServer(LDAPClient::Params & params, const Poco::Util::AbstractConf
    const bool has_tls_ca_cert_file = config.has(ldap_server_config + ".tls_ca_cert_file");
    const bool has_tls_ca_cert_dir = config.has(ldap_server_config + ".tls_ca_cert_dir");
    const bool has_tls_cipher_suite = config.has(ldap_server_config + ".tls_cipher_suite");
+    const bool has_search_limit = config.has(ldap_server_config + ".search_limit");

    if (!has_host)
        throw Exception("Missing 'host' entry", ErrorCodes::BAD_ARGUMENTS);
@ -91,8 +94,8 @@ void parseLDAPServer(LDAPClient::Params & params, const Poco::Util::AbstractConf
    }
    else if (has_auth_dn_prefix || has_auth_dn_suffix)
    {
-        const auto auth_dn_prefix = config.getString(ldap_server_config + ".auth_dn_prefix");
-        const auto auth_dn_suffix = config.getString(ldap_server_config + ".auth_dn_suffix");
+        std::string auth_dn_prefix = config.getString(ldap_server_config + ".auth_dn_prefix");
+        std::string auth_dn_suffix = config.getString(ldap_server_config + ".auth_dn_suffix");
        params.bind_dn = auth_dn_prefix + "{user_name}" + auth_dn_suffix;
    }

@ -176,14 +179,17 @@ void parseLDAPServer(LDAPClient::Params & params, const Poco::Util::AbstractConf

    if (has_port)
    {
-        const auto port = config.getInt64(ldap_server_config + ".port");
-        if (port < 0 || port > 65535)
+        UInt32 port = config.getUInt(ldap_server_config + ".port");
+        if (port > 65535)
            throw Exception("Bad value for 'port' entry", ErrorCodes::BAD_ARGUMENTS);

        params.port = port;
    }
    else
        params.port = (params.enable_tls == LDAPClient::Params::TLSEnable::YES ? 636 : 389);
+
+    if (has_search_limit)
+        params.search_limit = static_cast<UInt32>(config.getUInt64(ldap_server_config + ".search_limit"));
 }

 void parseKerberosParams(GSSAcceptorContext::Params & params, const Poco::Util::AbstractConfiguration & config)
@ -313,11 +319,26 @@ void ExternalAuthenticators::setConfiguration(const Poco::Util::AbstractConfigur
    }
 }

+/// Produces a single 128-bit SipHash digest covering the LDAP connection params followed by
+/// every entry of the optional role-search list (a null list contributes nothing to the digest).
+UInt128 computeParamsHash(const LDAPClient::Params & params, const LDAPClient::RoleSearchParamsList * role_search_params)
+{
+    SipHash digest;
+    params.updateHash(digest);
+
+    /// No role-search list was supplied: the digest of the connection params alone is the result.
+    if (!role_search_params)
+        return digest.get128();
+
+    for (const auto & role_params : *role_search_params)
+        role_params.updateHash(digest);
+
+    return digest.get128();
+}
+
 bool ExternalAuthenticators::checkLDAPCredentials(const String & server, const BasicCredentials & credentials,
    const LDAPClient::RoleSearchParamsList * role_search_params, LDAPClient::SearchResultsList * role_search_results) const
 {
    std::optional<LDAPClient::Params> params;
-    std::size_t params_hash = 0;
+    UInt128 params_hash = 0;

    {
        std::scoped_lock lock(mutex);
@ -331,14 +352,7 @@ bool ExternalAuthenticators::checkLDAPCredentials(const String & server, const B
        params->user = credentials.getUserName();
        params->password = credentials.getPassword();

-        params->combineCoreHash(params_hash);
-        if (role_search_params)
-        {
-            for (const auto & params_instance : *role_search_params)
-            {
-                params_instance.combineHash(params_hash);
-            }
-        }
+        params_hash = computeParamsHash(*params, role_search_params);

        // Check the cache, but only if the caching is enabled at all.
        if (params->verification_cooldown > std::chrono::seconds{0})
@ -408,15 +422,7 @@ bool ExternalAuthenticators::checkLDAPCredentials(const String & server, const B
        new_params.user = credentials.getUserName();
        new_params.password = credentials.getPassword();

-        std::size_t new_params_hash = 0;
-        new_params.combineCoreHash(new_params_hash);
-        if (role_search_params)
-        {
-            for (const auto & params_instance : *role_search_params)
-            {
-                params_instance.combineHash(new_params_hash);
-            }
-        }
+        const UInt128 new_params_hash = computeParamsHash(new_params, role_search_params);

        // If the critical server params have changed while we were checking the password, we discard the current result.
        if (params_hash != new_params_hash)
--- a/src/Access/LDAPClient.cpp
+++ b/src/Access/LDAPClient.cpp
@ -2,10 +2,10 @@
 #include <Common/Exception.h>
 #include <base/scope_guard.h>
 #include <Common/logger_useful.h>
+#include <Common/SipHash.h>

 #include <Poco/Logger.h>
 #include <boost/algorithm/string/predicate.hpp>
-#include <boost/container_hash/hash.hpp>

 #include <mutex>
 #include <utility>
@ -15,6 +15,22 @@

 #include <sys/time.h>

+namespace
+{
+
+/// File-local helpers that feed a single value into a SipHash.
+/// They live at global scope so that the LDAPClient::*::updateHash members below can reach them as `::updateHash`.
+
+/// Overload for fundamental types (enabled only when the decayed T satisfies std::is_fundamental_v):
+/// forwards the value to SipHash::update unchanged.
+template <typename T, typename = std::enable_if_t<std::is_fundamental_v<std::decay_t<T>>>>
+void updateHash(SipHash & hash, const T & value)
+{
+    hash.update(value);
+}
+
+/// Overload for strings: hashes the length first, then the contents.
+/// NOTE(review): presumably the length prefix keeps adjacent string fields from blending in the hash stream - confirm.
+void updateHash(SipHash & hash, const std::string & value)
+{
+    hash.update(value.size());
+    hash.update(value);
+}
+
+}

 namespace DB
 {
@ -26,30 +42,30 @@ namespace ErrorCodes
    extern const int LDAP_ERROR;
 }

-void LDAPClient::SearchParams::combineHash(std::size_t & seed) const
+/// Feeds base_dn, scope (converted to int), search_filter and attribute into `hash`, in that order.
+void LDAPClient::SearchParams::updateHash(SipHash & hash) const
 {
-    boost::hash_combine(seed, base_dn);
-    boost::hash_combine(seed, static_cast<int>(scope));
-    boost::hash_combine(seed, search_filter);
-    boost::hash_combine(seed, attribute);
+    /// The leading `::` selects the global-scope helpers; an unqualified updateHash would resolve to this member.
+    ::updateHash(hash, base_dn);
+    ::updateHash(hash, static_cast<int>(scope));
+    ::updateHash(hash, search_filter);
+    ::updateHash(hash, attribute);
 }

-void LDAPClient::RoleSearchParams::combineHash(std::size_t & seed) const
+/// Hashes the inherited SearchParams fields first, then the role-specific `prefix`.
+void LDAPClient::RoleSearchParams::updateHash(SipHash & hash) const
 {
-    SearchParams::combineHash(seed);
-    boost::hash_combine(seed, prefix);
+    SearchParams::updateHash(hash);
+    /// The leading `::` selects the global-scope helper rather than this member function.
+    ::updateHash(hash, prefix);
 }

-void LDAPClient::Params::combineCoreHash(std::size_t & seed) const
+/// Feeds host, port, bind_dn, user and password into `hash`, followed by the user_dn_detection
+/// search params when they are set. Only these members participate; nothing else of Params is hashed here.
+void LDAPClient::Params::updateHash(SipHash & hash) const
 {
-    boost::hash_combine(seed, host);
-    boost::hash_combine(seed, port);
-    boost::hash_combine(seed, bind_dn);
-    boost::hash_combine(seed, user);
-    boost::hash_combine(seed, password);
+    /// The leading `::` selects the global-scope helpers; an unqualified updateHash would resolve to this member.
+    ::updateHash(hash, host);
+    ::updateHash(hash, port);
+    ::updateHash(hash, bind_dn);
+    ::updateHash(hash, user);
+    ::updateHash(hash, password);

     if (user_dn_detection)
-        user_dn_detection->combineHash(seed);
+        user_dn_detection->updateHash(hash);
 }

 LDAPClient::LDAPClient(const Params & params_)
@ -153,13 +169,13 @@ namespace

 }

-void LDAPClient::diag(int rc, String text)
+void LDAPClient::handleError(int result_code, String text)
 {
    std::scoped_lock lock(ldap_global_mutex);

-    if (rc != LDAP_SUCCESS)
+    if (result_code != LDAP_SUCCESS)
    {
-        const char * raw_err_str = ldap_err2string(rc);
+        const char * raw_err_str = ldap_err2string(result_code);
        if (raw_err_str && *raw_err_str != '\0')
        {
            if (!text.empty())
@ -214,7 +230,7 @@ bool LDAPClient::openConnection()

        SCOPE_EXIT({ ldap_memfree(uri); });

-        diag(ldap_initialize(&handle, uri));
+        handleError(ldap_initialize(&handle, uri));
        if (!handle)
            throw Exception("ldap_initialize() failed", ErrorCodes::LDAP_ERROR);
    }
@ -226,13 +242,13 @@ bool LDAPClient::openConnection()
            case LDAPClient::Params::ProtocolVersion::V2: value = LDAP_VERSION2; break;
            case LDAPClient::Params::ProtocolVersion::V3: value = LDAP_VERSION3; break;
        }
-        diag(ldap_set_option(handle, LDAP_OPT_PROTOCOL_VERSION, &value));
+        handleError(ldap_set_option(handle, LDAP_OPT_PROTOCOL_VERSION, &value));
    }

-    diag(ldap_set_option(handle, LDAP_OPT_RESTART, LDAP_OPT_ON));
+    handleError(ldap_set_option(handle, LDAP_OPT_RESTART, LDAP_OPT_ON));

 #ifdef LDAP_OPT_KEEPCONN
-    diag(ldap_set_option(handle, LDAP_OPT_KEEPCONN, LDAP_OPT_ON));
+    handleError(ldap_set_option(handle, LDAP_OPT_KEEPCONN, LDAP_OPT_ON));
 #endif

 #ifdef LDAP_OPT_TIMEOUT
@ -240,7 +256,7 @@ bool LDAPClient::openConnection()
        ::timeval operation_timeout;
        operation_timeout.tv_sec = params.operation_timeout.count();
        operation_timeout.tv_usec = 0;
-        diag(ldap_set_option(handle, LDAP_OPT_TIMEOUT, &operation_timeout));
+        handleError(ldap_set_option(handle, LDAP_OPT_TIMEOUT, &operation_timeout));
    }
 #endif

@ -249,18 +265,18 @@ bool LDAPClient::openConnection()
        ::timeval network_timeout;
        network_timeout.tv_sec = params.network_timeout.count();
        network_timeout.tv_usec = 0;
-        diag(ldap_set_option(handle, LDAP_OPT_NETWORK_TIMEOUT, &network_timeout));
+        handleError(ldap_set_option(handle, LDAP_OPT_NETWORK_TIMEOUT, &network_timeout));
    }
 #endif

    {
        const int search_timeout = static_cast<int>(params.search_timeout.count());
-        diag(ldap_set_option(handle, LDAP_OPT_TIMELIMIT, &search_timeout));
+        handleError(ldap_set_option(handle, LDAP_OPT_TIMELIMIT, &search_timeout));
    }

    {
-        const int size_limit = params.search_limit;
-        diag(ldap_set_option(handle, LDAP_OPT_SIZELIMIT, &size_limit));
+        const int size_limit = static_cast<int>(params.search_limit);
+        handleError(ldap_set_option(handle, LDAP_OPT_SIZELIMIT, &size_limit));
    }

 #ifdef LDAP_OPT_X_TLS_PROTOCOL_MIN
@ -274,7 +290,7 @@ bool LDAPClient::openConnection()
            case LDAPClient::Params::TLSProtocolVersion::TLS1_1: value = LDAP_OPT_X_TLS_PROTOCOL_TLS1_1; break;
            case LDAPClient::Params::TLSProtocolVersion::TLS1_2: value = LDAP_OPT_X_TLS_PROTOCOL_TLS1_2; break;
        }
-        diag(ldap_set_option(handle, LDAP_OPT_X_TLS_PROTOCOL_MIN, &value));
+        handleError(ldap_set_option(handle, LDAP_OPT_X_TLS_PROTOCOL_MIN, &value));
    }
 #endif

@ -288,44 +304,44 @@ bool LDAPClient::openConnection()
            case LDAPClient::Params::TLSRequireCert::TRY:    value = LDAP_OPT_X_TLS_TRY;    break;
            case LDAPClient::Params::TLSRequireCert::DEMAND: value = LDAP_OPT_X_TLS_DEMAND; break;
        }
-        diag(ldap_set_option(handle, LDAP_OPT_X_TLS_REQUIRE_CERT, &value));
+        handleError(ldap_set_option(handle, LDAP_OPT_X_TLS_REQUIRE_CERT, &value));
    }
 #endif

 #ifdef LDAP_OPT_X_TLS_CERTFILE
    if (!params.tls_cert_file.empty())
-        diag(ldap_set_option(handle, LDAP_OPT_X_TLS_CERTFILE, params.tls_cert_file.c_str()));
+        handleError(ldap_set_option(handle, LDAP_OPT_X_TLS_CERTFILE, params.tls_cert_file.c_str()));
 #endif

 #ifdef LDAP_OPT_X_TLS_KEYFILE
    if (!params.tls_key_file.empty())
-        diag(ldap_set_option(handle, LDAP_OPT_X_TLS_KEYFILE, params.tls_key_file.c_str()));
+        handleError(ldap_set_option(handle, LDAP_OPT_X_TLS_KEYFILE, params.tls_key_file.c_str()));
 #endif

 #ifdef LDAP_OPT_X_TLS_CACERTFILE
    if (!params.tls_ca_cert_file.empty())
-        diag(ldap_set_option(handle, LDAP_OPT_X_TLS_CACERTFILE, params.tls_ca_cert_file.c_str()));
+        handleError(ldap_set_option(handle, LDAP_OPT_X_TLS_CACERTFILE, params.tls_ca_cert_file.c_str()));
 #endif

 #ifdef LDAP_OPT_X_TLS_CACERTDIR
    if (!params.tls_ca_cert_dir.empty())
-        diag(ldap_set_option(handle, LDAP_OPT_X_TLS_CACERTDIR, params.tls_ca_cert_dir.c_str()));
+        handleError(ldap_set_option(handle, LDAP_OPT_X_TLS_CACERTDIR, params.tls_ca_cert_dir.c_str()));
 #endif

 #ifdef LDAP_OPT_X_TLS_CIPHER_SUITE
    if (!params.tls_cipher_suite.empty())
-        diag(ldap_set_option(handle, LDAP_OPT_X_TLS_CIPHER_SUITE, params.tls_cipher_suite.c_str()));
+        handleError(ldap_set_option(handle, LDAP_OPT_X_TLS_CIPHER_SUITE, params.tls_cipher_suite.c_str()));
 #endif

 #ifdef LDAP_OPT_X_TLS_NEWCTX
    {
        const int i_am_a_server = 0;
-        diag(ldap_set_option(handle, LDAP_OPT_X_TLS_NEWCTX, &i_am_a_server));
+        handleError(ldap_set_option(handle, LDAP_OPT_X_TLS_NEWCTX, &i_am_a_server));
    }
 #endif

    if (params.enable_tls == LDAPClient::Params::TLSEnable::YES_STARTTLS)
-        diag(ldap_start_tls_s(handle, nullptr, nullptr));
+        handleError(ldap_start_tls_s(handle, nullptr, nullptr));

    final_user_name = escapeForDN(params.user);
    final_bind_dn = replacePlaceholders(params.bind_dn, { {"{user_name}", final_user_name} });
@ -346,7 +362,7 @@ bool LDAPClient::openConnection()
                if (rc == LDAP_INVALID_CREDENTIALS)
                    return false;

-                diag(rc);
+                handleError(rc);
            }

            // Once bound, run the user DN search query and update the default value, if asked.
@ -425,7 +441,7 @@ LDAPClient::SearchResults LDAPClient::search(const SearchParams & search_params)
        }
    });

-    diag(ldap_search_ext_s(handle, final_base_dn.c_str(), scope, final_search_filter.c_str(), attrs, 0, nullptr, nullptr, &timeout, params.search_limit, &msgs));
+    handleError(ldap_search_ext_s(handle, final_base_dn.c_str(), scope, final_search_filter.c_str(), attrs, 0, nullptr, nullptr, &timeout, params.search_limit, &msgs));

    for (
         auto * msg = ldap_first_message(handle, msgs);
@ -452,7 +468,7 @@ LDAPClient::SearchResults LDAPClient::search(const SearchParams & search_params)

                    ::berval bv;

-                    diag(ldap_get_dn_ber(handle, msg, &ber, &bv));
+                    handleError(ldap_get_dn_ber(handle, msg, &ber, &bv));

                    if (bv.bv_val && bv.bv_len > 0)
                        result.emplace(bv.bv_val, bv.bv_len);
@ -504,7 +520,7 @@ LDAPClient::SearchResults LDAPClient::search(const SearchParams & search_params)
            case LDAP_RES_SEARCH_REFERENCE:
            {
                char ** referrals = nullptr;
-                diag(ldap_parse_reference(handle, msg, &referrals, nullptr, 0));
+                handleError(ldap_parse_reference(handle, msg, &referrals, nullptr, 0));

                if (referrals)
                {
@ -528,7 +544,7 @@ LDAPClient::SearchResults LDAPClient::search(const SearchParams & search_params)
                char * matched_msg = nullptr;
                char * error_msg = nullptr;

-                diag(ldap_parse_result(handle, msg, &rc, &matched_msg, &error_msg, nullptr, nullptr, 0));
+                handleError(ldap_parse_result(handle, msg, &rc, &matched_msg, &error_msg, nullptr, nullptr, 0));

                if (rc != LDAP_SUCCESS)
                {
@ -610,7 +626,7 @@ bool LDAPSimpleAuthClient::authenticate(const RoleSearchParamsList * role_search

 #else // USE_LDAP

-void LDAPClient::diag(const int, String)
+/// Variant for builds without USE_LDAP: ignores both arguments and always throws
+/// FEATURE_IS_NOT_ENABLED_AT_BUILD_TIME.
+void LDAPClient::handleError(const int, String)
 {
    throw Exception("ClickHouse was built without LDAP support", ErrorCodes::FEATURE_IS_NOT_ENABLED_AT_BUILD_TIME);
 }
--- a/src/Access/LDAPClient.h
+++ b/src/Access/LDAPClient.h
@ -16,6 +16,7 @@
 #include <set>
 #include <vector>

+class SipHash;

 namespace DB
 {
@ -38,7 +39,7 @@ public:
        String search_filter;
        String attribute = "cn";

-        void combineHash(std::size_t & seed) const;
+        void updateHash(SipHash & hash) const;
    };

    struct RoleSearchParams
@ -46,7 +47,7 @@ public:
    {
        String prefix;

-        void combineHash(std::size_t & seed) const;
+        void updateHash(SipHash & hash) const;
    };

    using RoleSearchParamsList = std::vector<RoleSearchParams>;
@ -95,7 +96,7 @@ public:
        ProtocolVersion protocol_version = ProtocolVersion::V3;

        String host;
-        std::uint16_t port = 636;
+        UInt16 port = 636;

        TLSEnable enable_tls = TLSEnable::YES;
        TLSProtocolVersion tls_minimum_protocol_version = TLSProtocolVersion::TLS1_2;
@ -119,9 +120,9 @@ public:
        std::chrono::seconds operation_timeout{40};
        std::chrono::seconds network_timeout{30};
        std::chrono::seconds search_timeout{20};
-        std::uint32_t search_limit = 100;
+        UInt32 search_limit = 256; /// An arbitrary number, no particular motivation for this value.

-        void combineCoreHash(std::size_t & seed) const;
+        void updateHash(SipHash & hash) const;
    };

    explicit LDAPClient(const Params & params_);
@ -133,7 +134,7 @@ public:
    LDAPClient & operator= (LDAPClient &&) = delete;

 protected:
-    MAYBE_NORETURN void diag(int rc, String text = "");
+    MAYBE_NORETURN void handleError(int result_code, String text = "");
    MAYBE_NORETURN bool openConnection();
    void closeConnection() noexcept;
    SearchResults search(const SearchParams & search_params);
--- a/src/Access/UsersConfigAccessStorage.cpp
+++ b/src/Access/UsersConfigAccessStorage.cpp
@ -228,6 +228,12 @@ namespace
            user->access.revokeGrantOption(AccessType::ALL);
        }

+        bool show_named_collections = config.getBool(user_config + ".show_named_collections", false);
+        if (!show_named_collections)
+        {
+            user->access.revoke(AccessType::SHOW_NAMED_COLLECTIONS);
+        }
+
        String default_database = config.getString(user_config + ".default_database", "");
        user->default_database = default_database;

--- a/src/AggregateFunctions/AggregateFunctionArgMinMax.h
+++ b/src/AggregateFunctions/AggregateFunctionArgMinMax.h
@ -13,6 +13,7 @@ struct Settings;
 namespace ErrorCodes
 {
    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
+    extern const int CORRUPTED_DATA;
 }


@ -89,6 +90,13 @@ public:
    {
        this->data(place).result.read(buf, *serialization_res, arena);
        this->data(place).value.read(buf, *serialization_val, arena);
+        if (unlikely(this->data(place).value.has() != this->data(place).result.has()))
+            throw Exception(
+                ErrorCodes::CORRUPTED_DATA,
+                "Invalid state of the aggregate function {}: has_value ({}) != has_result ({})",
+                getName(),
+                this->data(place).value.has(),
+                this->data(place).result.has());
    }

    bool allocatesMemoryInArena() const override
--- a/src/AggregateFunctions/AggregateFunctionMinMaxAny.h
+++ b/src/AggregateFunctions/AggregateFunctionMinMaxAny.h
@ -448,6 +448,34 @@ public:

 };

+struct Compatibility
+{
+    /// Historically SingleValueDataString kept the terminating null-character as part of the stored value.
+    /// The -WithTerminatingZero methods were later dropped from the IColumn interface as dangerous and easy
+    /// to misuse, which made serialized states incompatible.
+    /// See https://github.com/ClickHouse/ClickHouse/pull/41431 and https://github.com/ClickHouse/ClickHouse/issues/42916
+    /// The two helpers below reproduce the old behaviour for compatibility.
+    /// It's safe because there's no way unsanitized user input (without \0 at the end) can reach these functions.
+
+    /// Returns the n-th string of the column as a reference that also covers its trailing null-character.
+    static StringRef getDataAtWithTerminatingZero(const ColumnString & column, size_t n)
+    {
+        auto with_zero = column.getDataAt(n);
+        /// ColumnString keeps one extra byte holding '\0' right after every string,
+        /// while getDataAt reports the length without it - widen the reference by that byte.
+        chassert(with_zero.data[with_zero.size] == '\0');
+        with_zero.size += 1;
+        return with_zero;
+    }
+
+    /// Appends to the column a value whose last byte is the terminating null-character.
+    static void insertDataWithTerminatingZero(ColumnString & column, const char * pos, size_t length)
+    {
+        /// The value already ends with '\0' and insertData unconditionally adds one more,
+        /// so pass the length without the existing terminator to avoid duplicating it.
+        chassert(0 < length);
+        chassert(pos[length - 1] == '\0');
+        const size_t length_without_zero = length - 1;
+        column.insertData(pos, length_without_zero);
+    }
+};

 /** For strings. Short strings are stored in the object itself, and long strings are allocated separately.
  * NOTE It could also be suitable for arrays of numbers.
@ -477,20 +505,31 @@ public:
        return size >= 0;
    }

-    const char * getData() const
+private:
+    /// Writable pointer to the stored bytes: small_data while size <= MAX_SMALL_STRING_SIZE, large_data otherwise.
+    /// Performs no null-terminator assertion, so it can be used while the value is still being assembled.
+    char * getDataMutable()
    {
        return size <= MAX_SMALL_STRING_SIZE ? small_data : large_data;
    }

+    /// Read-only pointer to the stored bytes (small_data or large_data, selected by size).
+    /// Asserts (chassert) that the value is non-empty and that its last byte is '\0'.
+    const char * getData() const
+    {
+        const char * data_ptr = size <= MAX_SMALL_STRING_SIZE ? small_data : large_data;
+        /// It must always be terminated with null-character
+        chassert(0 < size);
+        chassert(data_ptr[size - 1] == '\0');
+        return data_ptr;
+    }
+
    StringRef getStringRef() const
    {
+        /// The reference spans all `size` bytes, i.e. it includes the terminating '\0' that getData() asserts.
        return StringRef(getData(), size);
    }

+public:
    void insertResultInto(IColumn & to) const
    {
+        /// The stored value ends with '\0'; the compatibility helper strips it before inserting,
+        /// because ColumnString::insertData appends a terminator of its own.
        if (has())
-            assert_cast<ColumnString &>(to).insertData(getData(), size);
+            Compatibility::insertDataWithTerminatingZero(assert_cast<ColumnString &>(to), getData(), size);
        else
            assert_cast<ColumnString &>(to).insertDefault();
    }
@ -502,44 +541,76 @@ public:
            buf.write(getData(), size);
    }

+    /// Makes sure large_data can hold at least size_to_reserve bytes.
+    /// When the current capacity is too small, a new buffer is taken from the arena with the capacity rounded up
+    /// to a power of two; the previous buffer is neither freed nor copied (callers fill the new one themselves).
+    /// Throws TOO_LARGE_STRING_SIZE if the rounded capacity does not fit into Int32.
+    /// If this function throws, capacity and large_data are left exactly as they were.
+    void allocateLargeDataIfNeeded(Int64 size_to_reserve, Arena * arena)
+    {
+        if (size_to_reserve <= capacity)
+            return;
+
+        /// Compute into locals and commit only after the allocation succeeded. Otherwise an exception from
+        /// the size check or from arena->alloc would leave capacity describing a buffer that was never allocated,
+        /// and a later call could skip the allocation and write past the end of the old buffer.
+        const Int32 new_capacity = static_cast<Int32>(roundUpToPowerOfTwoOrZero(size_to_reserve));
+
+        /// It might happen if the size was too big and the rounded value does not fit into Int32
+        if (unlikely(new_capacity < size_to_reserve))
+            throw Exception(ErrorCodes::TOO_LARGE_STRING_SIZE, "String size is too big ({})", size_to_reserve);
+
+        /// Don't free large_data here.
+        auto * new_data = arena->alloc(new_capacity);
+
+        large_data = new_data;
+        capacity = new_capacity;
+    }
+
+    /// Deserializes the state from `buf`: an Int32 size followed by that many bytes.
+    /// A negative size is stored as-is and nothing else is read.
+    /// Otherwise the bytes are read into small_data (size <= MAX_SMALL_STRING_SIZE) or into large_data,
+    /// which is taken from `arena` with one spare byte. If the value read does not end with '\0',
+    /// a terminator is appended (moving the value to large_data when it no longer fits into small_data).
    void read(ReadBuffer & buf, const ISerialization & /*serialization*/, Arena * arena)
    {
        Int32 rhs_size;
        readBinary(rhs_size, buf);

-        if (rhs_size >= 0)
-        {
-            if (rhs_size <= MAX_SMALL_STRING_SIZE)
-            {
-                /// Don't free large_data here.
-
-                size = rhs_size;
-
-                if (size > 0)
-                    buf.readStrict(small_data, size);
-            }
-            else
-            {
-                if (capacity < rhs_size)
-                {
-                    capacity = static_cast<Int32>(roundUpToPowerOfTwoOrZero(rhs_size));
-                    /// It might happen if the size was too big and the rounded value does not fit a size_t
-                    if (unlikely(capacity < rhs_size))
-                        throw Exception(ErrorCodes::TOO_LARGE_STRING_SIZE, "String size is too big ({})", rhs_size);
-
-                    /// Don't free large_data here.
-                    large_data = arena->alloc(capacity);
-                }
-
-                size = rhs_size;
-                buf.readStrict(large_data, size);
-            }
-        }
-        else
+        if (rhs_size < 0)
        {
            /// Don't free large_data here.
            size = rhs_size;
+            return;
        }
+
+        if (rhs_size <= MAX_SMALL_STRING_SIZE)
+        {
+            /// Don't free large_data here.
+
+            size = rhs_size;
+
+            if (size > 0)
+                buf.readStrict(small_data, size);
+        }
+        else
+        {
+            /// Reserve one byte more for null-character
+            Int64 rhs_size_to_reserve = rhs_size;
+            rhs_size_to_reserve += 1; /// Avoid overflow
+            allocateLargeDataIfNeeded(rhs_size_to_reserve, arena);
+            size = rhs_size;
+            buf.readStrict(large_data, size);
+        }
+
+        /// Check if the string we read is null-terminated (getDataMutable does not have the assertion)
+        if (0 < size && getDataMutable()[size - 1] == '\0')
+            return;
+
+        /// It's not null-terminated, but it must be (for historical reasons). There are two variants:
+        /// - The value was serialized by one of the incompatible versions of ClickHouse. We had some range of versions
+        ///   that used to serialize SingleValueDataString without terminating '\0'. Let's just append it.
+        /// - An attacker sent crafted data. Sanitize it and append '\0'.
+        /// In all other cases the string must be already null-terminated.
+
+        /// NOTE We cannot add '\0' unconditionally, because it will be duplicated.
+        /// NOTE It's possible that a string that actually ends with '\0' was written by one of the incompatible versions.
+        ///      Unfortunately, we cannot distinguish it from normal string written by normal version.
+        ///      So such strings will be trimmed.
+
+        if (size == MAX_SMALL_STRING_SIZE)
+        {
+            /// Special case: We have to move value to large_data
+            allocateLargeDataIfNeeded(size + 1, arena);
+            memcpy(large_data, small_data, size);
+        }
+
+        /// We have enough space to append
+        /// (size is incremented first so that getDataMutable() picks the buffer matching the new size).
+        ++size;
+        getDataMutable()[size - 1] = '\0';
    }

    /// Assuming to.has()
@ -557,13 +628,7 @@ public:
        }
        else
        {
-            if (capacity < value_size)
-            {
-                /// Don't free large_data here.
-                capacity = static_cast<Int32>(roundUpToPowerOfTwoOrZero(value_size));
-                large_data = arena->alloc(capacity);
-            }
-
+            allocateLargeDataIfNeeded(value_size, arena);
            size = value_size;
            memcpy(large_data, value.data, size);
        }
@ -571,7 +636,7 @@ public:

    void change(const IColumn & column, size_t row_num, Arena * arena)
    {
+        /// Take the row together with its trailing '\0', because the stored value keeps that terminator.
-        changeImpl(assert_cast<const ColumnString &>(column).getDataAt(row_num), arena);
+        changeImpl(Compatibility::getDataAtWithTerminatingZero(assert_cast<const ColumnString &>(column), row_num), arena);
    }

    void change(const Self & to, Arena * arena)
@ -620,7 +685,7 @@ public:

    bool changeIfLess(const IColumn & column, size_t row_num, Arena * arena)
    {
-        if (!has() || assert_cast<const ColumnString &>(column).getDataAt(row_num) < getStringRef())
+        if (!has() || Compatibility::getDataAtWithTerminatingZero(assert_cast<const ColumnString &>(column), row_num) < getStringRef())
        {
            change(column, row_num, arena);
            return true;
@ -642,7 +707,7 @@ public:

    bool changeIfGreater(const IColumn & column, size_t row_num, Arena * arena)
    {
-        if (!has() || assert_cast<const ColumnString &>(column).getDataAt(row_num) > getStringRef())
+        if (!has() || Compatibility::getDataAtWithTerminatingZero(assert_cast<const ColumnString &>(column), row_num) > getStringRef())
        {
            change(column, row_num, arena);
            return true;
@ -669,7 +734,7 @@ public:

    bool isEqualTo(const IColumn & column, size_t row_num) const
    {
+        /// Compare against the row including its trailing '\0', since getStringRef() spans the terminator as well.
-        return has() && assert_cast<const ColumnString &>(column).getDataAt(row_num) == getStringRef();
+        return has() && Compatibility::getDataAtWithTerminatingZero(assert_cast<const ColumnString &>(column), row_num) == getStringRef();
    }

    static bool allocatesMemoryInArena()
--- a/src/Analyzer/HashUtils.h
+++ b/src/Analyzer/HashUtils.h
@ -0,0 +1,60 @@
+#pragma once
+
+#include <Analyzer/IQueryTreeNode.h>
+
+namespace DB
+{
+
+/** This structure holds query tree node ptr and its hash. It can be used as hash map key to avoid unnecessary hash
+  * recalculations.
+  *
+  * The hash is computed once, in the constructor, as the first component of node->getTreeHash().
+  * NOTE(review): the constructor dereferences the node, so a null pointer is presumably not a valid argument - confirm.
+  *
+  * Example of usage:
+  * std::unordered_map<QueryTreeNodeConstRawPtrWithHash, std::string> map;
+  */
+template <typename QueryTreeNodePtrType>
+struct QueryTreeNodeWithHash
+{
+    /// Intentionally not explicit (hence the NOLINT): a node pointer converts to a key implicitly.
+    QueryTreeNodeWithHash(QueryTreeNodePtrType node_) /// NOLINT
+        : node(std::move(node_))
+        /// Reads the member `node`, not the moved-from parameter; relies on `node` being declared before `hash`.
+        , hash(node->getTreeHash().first)
+    {}
+
+    /// Keep `node` declared before `hash`: members are initialized in declaration order.
+    QueryTreeNodePtrType node = nullptr;
+    size_t hash = 0;
+};
+
+/// Two keys are equal when their cached hashes match and the nodes they point to compare equal via isEqual.
+template <typename T>
+inline bool operator==(const QueryTreeNodeWithHash<T> & lhs, const QueryTreeNodeWithHash<T> & rhs)
+{
+    /// Cheap rejection first: different cached hashes mean the tree comparison is not needed.
+    if (lhs.hash != rhs.hash)
+        return false;
+
+    return lhs.node->isEqual(*rhs.node);
+}
+
+/// Negation of operator== for QueryTreeNodeWithHash.
+template <typename T>
+inline bool operator!=(const QueryTreeNodeWithHash<T> & lhs, const QueryTreeNodeWithHash<T> & rhs)
+{
+    const bool is_equal = (lhs == rhs);
+    return !is_equal;
+}
+
+using QueryTreeNodePtrWithHash = QueryTreeNodeWithHash<QueryTreeNodePtr>;
+using QueryTreeNodeRawPtrWithHash = QueryTreeNodeWithHash<IQueryTreeNode *>;
+using QueryTreeNodeConstRawPtrWithHash = QueryTreeNodeWithHash<const IQueryTreeNode *>;
+
+using QueryTreeNodePtrWithHashSet = std::unordered_set<QueryTreeNodePtrWithHash>;
+using QueryTreeNodeConstRawPtrWithHashSet = std::unordered_set<QueryTreeNodeConstRawPtrWithHash>;
+
+template <typename Value>
+using QueryTreeNodePtrWithHashMap = std::unordered_map<QueryTreeNodePtrWithHash, Value>;
+
+template <typename Value>
+using QueryTreeNodeConstRawPtrWithHashMap = std::unordered_map<QueryTreeNodeConstRawPtrWithHash, Value>;
+
+}
+
+/// Makes QueryTreeNodeWithHash usable as a key of std::unordered_* containers:
+/// the functor returns the hash cached inside the key instead of recomputing it.
+template <typename T>
+struct std::hash<DB::QueryTreeNodeWithHash<T>>
+{
+    size_t operator()(const DB::QueryTreeNodeWithHash<T> & key) const
+    {
+        const size_t cached_hash = key.hash;
+        return cached_hash;
+    }
+};
--- a/src/Analyzer/Passes/FuseFunctionsPass.cpp
+++ b/src/Analyzer/Passes/FuseFunctionsPass.cpp
@ -8,6 +8,7 @@
 #include <Analyzer/InDepthQueryTreeVisitor.h>
 #include <Analyzer/FunctionNode.h>
 #include <Analyzer/ConstantNode.h>
+#include <Analyzer/HashUtils.h>

 #include <DataTypes/DataTypesNumber.h>
 #include <DataTypes/DataTypeArray.h>
@ -48,43 +49,24 @@ public:
            /// Do not apply for `count()` with without arguments or `count(*)`, only `count(x)` is supported.
            return;

-        mapping[QueryTreeNodeWithHash(argument_nodes[0])].push_back(&node);
+        argument_to_functions_mapping[argument_nodes[0]].push_back(&node);
    }

-    struct QueryTreeNodeWithHash
-    {
-        const QueryTreeNodePtr & node;
-        IQueryTreeNode::Hash hash;
-
-        explicit QueryTreeNodeWithHash(const QueryTreeNodePtr & node_)
-            : node(node_)
-            , hash(node->getTreeHash())
-        {}
-
-        bool operator==(const QueryTreeNodeWithHash & rhs) const
-        {
-            return hash == rhs.hash && node->isEqual(*rhs.node);
-        }
-
-        struct Hash
-        {
-            size_t operator() (const QueryTreeNodeWithHash & key) const { return key.hash.first ^ key.hash.second; }
-        };
-    };
-
    /// argument -> list of sum/count/avg functions with this argument
-    std::unordered_map<QueryTreeNodeWithHash, std::vector<QueryTreeNodePtr *>, QueryTreeNodeWithHash::Hash> mapping;
+    QueryTreeNodePtrWithHashMap<std::vector<QueryTreeNodePtr *>> argument_to_functions_mapping;

 private:
    std::unordered_set<String> names_to_collect;
 };

-QueryTreeNodePtr createResolvedFunction(ContextPtr context, const String & name, DataTypePtr result_type, QueryTreeNodes arguments)
+/// Creates a FunctionNode called `name`, resolves it as the ordinary function that FunctionFactory returns
+/// for that name and context with the given result type, and installs `arguments` as its argument nodes.
+QueryTreeNodePtr createResolvedFunction(const ContextPtr & context, const String & name, const DataTypePtr & result_type, QueryTreeNodes arguments)
 {
    auto function_node = std::make_shared<FunctionNode>(name);
+
    auto function = FunctionFactory::instance().get(name, context);
    function_node->resolveAsFunction(std::move(function), result_type);
    function_node->getArguments().getNodes() = std::move(arguments);
+
    return function_node;
 }

@ -94,21 +76,20 @@ FunctionNodePtr createResolvedAggregateFunction(const String & name, const Query

    AggregateFunctionProperties properties;
    auto aggregate_function = AggregateFunctionFactory::instance().get(name, {argument->getResultType()}, parameters, properties);
-
    function_node->resolveAsAggregateFunction(aggregate_function, aggregate_function->getReturnType());
+    function_node->getArguments().getNodes() = { argument };

-    function_node->getArgumentsNode() = std::make_shared<ListNode>(QueryTreeNodes{argument});
    return function_node;
 }

-QueryTreeNodePtr createTupleElementFunction(ContextPtr context, DataTypePtr result_type, QueryTreeNodePtr argument, UInt64 index)
+/// Builds a resolved `tupleElement` call whose arguments are `argument` and a constant node holding `index`.
+QueryTreeNodePtr createTupleElementFunction(const ContextPtr & context, const DataTypePtr & result_type, QueryTreeNodePtr argument, UInt64 index)
 {
-    return createResolvedFunction(context, "tupleElement", result_type, {argument, std::make_shared<ConstantNode>(index)});
+    return createResolvedFunction(context, "tupleElement", result_type, {std::move(argument), std::make_shared<ConstantNode>(index)});
 }

-QueryTreeNodePtr createArrayElementFunction(ContextPtr context, DataTypePtr result_type, QueryTreeNodePtr argument, UInt64 index)
+/// Builds a resolved `arrayElement` call whose arguments are `argument` and a constant node holding `index`.
+QueryTreeNodePtr createArrayElementFunction(const ContextPtr & context, const DataTypePtr & result_type, QueryTreeNodePtr argument, UInt64 index)
 {
-    return createResolvedFunction(context, "arrayElement", result_type, {argument, std::make_shared<ConstantNode>(index)});
+    return createResolvedFunction(context, "arrayElement", result_type, {std::move(argument), std::make_shared<ConstantNode>(index)});
 }

 void replaceWithSumCount(QueryTreeNodePtr & node, const FunctionNodePtr & sum_count_node, ContextPtr context)
@ -151,6 +132,7 @@ FunctionNodePtr createFusedQuantilesNode(const std::vector<QueryTreeNodePtr *> n
 {
    Array parameters;
    parameters.reserve(nodes.size());
+
    for (const auto * node : nodes)
    {
        const FunctionNode & function_node = (*node)->as<const FunctionNode &>();
@ -172,6 +154,7 @@ FunctionNodePtr createFusedQuantilesNode(const std::vector<QueryTreeNodePtr *> n

        parameters.push_back(constant_value->getValue());
    }
+
    return createResolvedAggregateFunction("quantiles", argument, parameters);
 }

@ -181,7 +164,7 @@ void tryFuseSumCountAvg(QueryTreeNodePtr query_tree_node, ContextPtr context)
    FuseFunctionsVisitor visitor({"sum", "count", "avg"});
    visitor.visit(query_tree_node);

-    for (auto & [argument, nodes] : visitor.mapping)
+    for (auto & [argument, nodes] : visitor.argument_to_functions_mapping)
    {
        if (nodes.size() < 2)
            continue;
@ -199,24 +182,22 @@ void tryFuseQuantiles(QueryTreeNodePtr query_tree_node, ContextPtr context)
 {
    FuseFunctionsVisitor visitor_quantile({"quantile"});
    visitor_quantile.visit(query_tree_node);
-    for (auto & [argument, nodes] : visitor_quantile.mapping)
+
+    for (auto & [argument, nodes] : visitor_quantile.argument_to_functions_mapping)
    {
-        if (nodes.size() < 2)
+        size_t nodes_size = nodes.size();
+        if (nodes_size < 2)
            continue;

        auto quantiles_node = createFusedQuantilesNode(nodes, argument.node);
        auto result_array_type = std::dynamic_pointer_cast<const DataTypeArray>(quantiles_node->getResultType());
        if (!result_array_type)
-        {
            throw Exception(ErrorCodes::LOGICAL_ERROR,
                "Unexpected return type '{}' of function '{}', should be array",
                quantiles_node->getResultType(), quantiles_node->getFunctionName());
-        }

-        for (size_t i = 0; i < nodes.size(); ++i)
-        {
+        for (size_t i = 0; i < nodes_size; ++i)
            *nodes[i] = createArrayElementFunction(context, result_array_type->getNestedType(), quantiles_node, i + 1);
-        }
    }
 }

--- a/src/Analyzer/Passes/OrderByLimitByDuplicateEliminationPass.cpp
+++ b/src/Analyzer/Passes/OrderByLimitByDuplicateEliminationPass.cpp
@ -3,6 +3,7 @@
 #include <Analyzer/InDepthQueryTreeVisitor.h>
 #include <Analyzer/QueryNode.h>
 #include <Analyzer/SortNode.h>
+#include <Analyzer/HashUtils.h>

 namespace DB
 {
@ -10,35 +11,6 @@ namespace DB
 namespace
 {

-struct QueryTreeNodeWithHash
-{
-    explicit QueryTreeNodeWithHash(const IQueryTreeNode * node_)
-        : node(node_)
-        , hash(node->getTreeHash().first)
-    {}
-
-    const IQueryTreeNode * node = nullptr;
-    size_t hash = 0;
-};
-
-struct QueryTreeNodeWithHashHash
-{
-    size_t operator()(const QueryTreeNodeWithHash & node_with_hash) const
-    {
-        return node_with_hash.hash;
-    }
-};
-
-struct QueryTreeNodeWithHashEqualTo
-{
-    bool operator()(const QueryTreeNodeWithHash & lhs_node, const QueryTreeNodeWithHash & rhs_node) const
-    {
-        return lhs_node.hash == rhs_node.hash && lhs_node.node->isEqual(*rhs_node.node);
-    }
-};
-
-using QueryTreeNodeWithHashSet = std::unordered_set<QueryTreeNodeWithHash, QueryTreeNodeWithHashHash, QueryTreeNodeWithHashEqualTo>;
-
 class OrderByLimitByDuplicateEliminationVisitor : public InDepthQueryTreeVisitor<OrderByLimitByDuplicateEliminationVisitor>
 {
 public:
@ -93,7 +65,7 @@ public:
    }

 private:
-    QueryTreeNodeWithHashSet unique_expressions_nodes_set;
+    QueryTreeNodeConstRawPtrWithHashSet unique_expressions_nodes_set;
 };

 }
--- a/src/Analyzer/Passes/QueryAnalysisPass.cpp
+++ b/src/Analyzer/Passes/QueryAnalysisPass.cpp
@ -67,6 +67,8 @@
 #include <Analyzer/InDepthQueryTreeVisitor.h>
 #include <Analyzer/QueryTreeBuilder.h>

+#include <Common/checkStackSize.h>
+
 namespace DB
 {

@ -517,7 +519,7 @@ public:

 private:
    QueryTreeNodes expressions;
-    std::unordered_map<std::string, std::vector<QueryTreeNodePtr>> alias_name_to_expressions;
+    std::unordered_map<std::string, QueryTreeNodes> alias_name_to_expressions;
 };

 /** Projection names is name of query tree node that is used in projection part of query node.
@ -1100,6 +1102,10 @@ private:

    static void validateJoinTableExpressionWithoutAlias(const QueryTreeNodePtr & join_node, const QueryTreeNodePtr & table_expression_node, IdentifierResolveScope & scope);

+    static void expandGroupByAll(QueryNode & query_tree_node_typed);
+
+    static std::pair<bool, UInt64> recursivelyCollectMaxOrdinaryExpressions(QueryTreeNodePtr & node, QueryTreeNodes & into);
+
    /// Resolve identifier functions

    static QueryTreeNodePtr tryResolveTableIdentifierFromDatabaseCatalog(const Identifier & table_identifier, ContextPtr context);
@ -1929,6 +1935,68 @@ void QueryAnalyzer::validateJoinTableExpressionWithoutAlias(const QueryTreeNodeP
            scope.scope_node->formatASTForErrorMessage());
 }

+/** Collect GROUP BY ALL key candidates from the expression subtree rooted at `node`.
+  *
+  * Appends to the end of `into` the largest sub-expressions (columns or functions)
+  * that do not contain an aggregate function.
+  *
+  * Returns a pair:
+  *  - first: true if an aggregate function was found in the subtree of `node`;
+  *  - second: number of nodes this call left appended at the end of `into`.
+  */
+std::pair<bool, UInt64> QueryAnalyzer::recursivelyCollectMaxOrdinaryExpressions(QueryTreeNodePtr & node, QueryTreeNodes & into)
+{
+    checkStackSize();
+
+    /// A column is a key candidate by itself
+    if (node->as<ColumnNode>())
+    {
+        into.push_back(node);
+        return {false, 1};
+    }
+
+    auto * function = node->as<FunctionNode>();
+
+    /// Any other node that is not a function contributes nothing
+    if (!function)
+        return {false, 0};
+
+    /// Aggregate function: report it to the caller, do not descend into its arguments
+    if (function->isAggregateFunction())
+        return {true, 0};
+
+    UInt64 pushed_children = 0;
+    bool has_aggregate = false;
+
+    /// Collect candidates from every argument, they are appended to the end of `into`
+    for (auto & child : function->getArguments().getNodes())
+    {
+        auto [child_has_aggregate, child_pushed_children] = recursivelyCollectMaxOrdinaryExpressions(child, into);
+        has_aggregate |= child_has_aggregate;
+        pushed_children += child_pushed_children;
+    }
+
+    /// The current function is not aggregate function and there is no aggregate function in its arguments,
+    /// so use the current function to replace its arguments
+    if (!has_aggregate)
+    {
+        /// Remove exactly the nodes that the arguments have appended above
+        for (UInt64 i = 0; i < pushed_children; i++)
+            into.pop_back();
+
+        into.push_back(node);
+        pushed_children = 1;
+    }
+
+    return {has_aggregate, pushed_children};
+}
+
+/** Expand GROUP BY ALL by extracting all the SELECT-ed expressions that are not aggregate functions.
+  *
+  * Special case: if a function has both aggregate functions and other expressions among its arguments,
+  * the `GROUP BY` keys will contain the maximal non-aggregate sub-expressions that can be extracted from it.
+  *
+  * Example:
+  * SELECT substring(a, 4, 2), substring(substring(a, 1, 2), 1, count(b)) FROM t GROUP BY ALL
+  * will expand as
+  * SELECT substring(a, 4, 2), substring(substring(a, 1, 2), 1, count(b)) FROM t GROUP BY substring(a, 4, 2), substring(a, 1, 2)
+  */
+void QueryAnalyzer::expandGroupByAll(QueryNode & query_tree_node_typed)
+{
+    auto & group_by_nodes = query_tree_node_typed.getGroupBy().getNodes();
+    auto & projection_list = query_tree_node_typed.getProjection();
+
+    /// Keys collected from each projection expression are appended directly to the GROUP BY nodes of the query
+    for (auto & node : projection_list.getNodes())
+        recursivelyCollectMaxOrdinaryExpressions(node, group_by_nodes);
+
+}
+

 /// Resolve identifier functions implementation

@ -2171,18 +2239,19 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromAliases(const Identifier
        auto & alias_identifier_node = it->second->as<IdentifierNode &>();
        auto identifier = alias_identifier_node.getIdentifier();
        auto lookup_result = tryResolveIdentifier(IdentifierLookup{identifier, identifier_lookup.lookup_context}, scope, identifier_resolve_settings);
-        if (!lookup_result.isResolved())
+        if (!lookup_result.resolved_identifier)
        {
            std::unordered_set<Identifier> valid_identifiers;
            collectScopeWithParentScopesValidIdentifiersForTypoCorrection(identifier, scope, true, false, false, valid_identifiers);
-
            auto hints = collectIdentifierTypoHints(identifier, valid_identifiers);
-            throw Exception(ErrorCodes::UNKNOWN_IDENTIFIER, "Unknown {} identifier '{}' in scope {}{}",
-                toStringLowercase(IdentifierLookupContext::EXPRESSION),
+
+            throw Exception(ErrorCodes::UNKNOWN_IDENTIFIER, "Unknown {} identifier '{}'. In scope {}{}",
+                toStringLowercase(identifier_lookup.lookup_context),
                identifier.getFullName(),
                scope.scope_node->formatASTForErrorMessage(),
                getHintsErrorMessageSuffix(hints));
        }
+
        it->second = lookup_result.resolved_identifier;

        /** During collection of aliases if node is identifier and has alias, we cannot say if it is
@ -2193,9 +2262,9 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromAliases(const Identifier
          * If we resolved identifier node as function, we must remove identifier node alias from
          * expression alias map.
          */
-        if (identifier_lookup.isExpressionLookup() && it->second)
+        if (identifier_lookup.isExpressionLookup())
            scope.alias_name_to_lambda_node.erase(identifier_bind_part);
-        else if (identifier_lookup.isFunctionLookup() && it->second)
+        else if (identifier_lookup.isFunctionLookup())
            scope.alias_name_to_expression_node.erase(identifier_bind_part);

        scope.expressions_in_resolve_process_stack.popNode();
@ -3203,11 +3272,9 @@ QueryAnalyzer::QueryTreeNodesWithNames QueryAnalyzer::resolveUnqualifiedMatcher(

        if (auto * array_join_node = table_expression->as<ArrayJoinNode>())
        {
-            size_t table_expressions_column_nodes_with_names_stack_size = table_expressions_column_nodes_with_names_stack.size();
-            if (table_expressions_column_nodes_with_names_stack_size < 1)
+            if (table_expressions_column_nodes_with_names_stack.empty())
                throw Exception(ErrorCodes::LOGICAL_ERROR,
-                    "Expected at least 1 table expressions on stack before ARRAY JOIN processing. Actual {}",
-                    table_expressions_column_nodes_with_names_stack_size);
+                    "Expected at least 1 table expressions on stack before ARRAY JOIN processing");

            auto & table_expression_column_nodes_with_names = table_expressions_column_nodes_with_names_stack.back();

@ -6006,6 +6073,9 @@ void QueryAnalyzer::resolveQuery(const QueryTreeNodePtr & query_node, Identifier
        node->removeAlias();
    }

+    if (query_node_typed.isGroupByAll())
+        expandGroupByAll(query_node_typed);
+
    /** Validate aggregates
      *
      * 1. Check that there are no aggregate functions and GROUPING function in JOIN TREE, WHERE, PREWHERE, in another aggregate functions.
--- a/src/Analyzer/QueryNode.cpp
+++ b/src/Analyzer/QueryNode.cpp
@ -54,6 +54,9 @@ void QueryNode::dumpTreeImpl(WriteBuffer & buffer, FormatState & format_state, s
    if (is_group_by_with_totals)
        buffer << ", is_group_by_with_totals: " << is_group_by_with_totals;

+    if (is_group_by_all)
+        buffer << ", is_group_by_all: " << is_group_by_all;
+
    std::string group_by_type;
    if (is_group_by_with_rollup)
        group_by_type = "rollup";
@ -117,7 +120,7 @@ void QueryNode::dumpTreeImpl(WriteBuffer & buffer, FormatState & format_state, s
        getWhere()->dumpTreeImpl(buffer, format_state, indent + 4);
    }

-    if (hasGroupBy())
+    if (!is_group_by_all && hasGroupBy())
    {
        buffer << '\n' << std::string(indent + 2, ' ') << "GROUP BY\n";
        getGroupBy().dumpTreeImpl(buffer, format_state, indent + 4);
@ -198,7 +201,8 @@ bool QueryNode::isEqualImpl(const IQueryTreeNode & rhs) const
        is_group_by_with_totals == rhs_typed.is_group_by_with_totals &&
        is_group_by_with_rollup == rhs_typed.is_group_by_with_rollup &&
        is_group_by_with_cube == rhs_typed.is_group_by_with_cube &&
-        is_group_by_with_grouping_sets == rhs_typed.is_group_by_with_grouping_sets;
+        is_group_by_with_grouping_sets == rhs_typed.is_group_by_with_grouping_sets &&
+        is_group_by_all == rhs_typed.is_group_by_all;
 }

 void QueryNode::updateTreeHashImpl(HashState & state) const
@ -226,6 +230,7 @@ void QueryNode::updateTreeHashImpl(HashState & state) const
    state.update(is_group_by_with_rollup);
    state.update(is_group_by_with_cube);
    state.update(is_group_by_with_grouping_sets);
+    state.update(is_group_by_all);

    if (constant_value)
    {
@ -251,6 +256,7 @@ QueryTreeNodePtr QueryNode::cloneImpl() const
    result_query_node->is_group_by_with_rollup = is_group_by_with_rollup;
    result_query_node->is_group_by_with_cube = is_group_by_with_cube;
    result_query_node->is_group_by_with_grouping_sets = is_group_by_with_grouping_sets;
+    result_query_node->is_group_by_all = is_group_by_all;
    result_query_node->cte_name = cte_name;
    result_query_node->projection_columns = projection_columns;
    result_query_node->constant_value = constant_value;
@ -267,6 +273,7 @@ ASTPtr QueryNode::toASTImpl() const
    select_query->group_by_with_rollup = is_group_by_with_rollup;
    select_query->group_by_with_cube = is_group_by_with_cube;
    select_query->group_by_with_grouping_sets = is_group_by_with_grouping_sets;
+    select_query->group_by_all = is_group_by_all;

    if (hasWith())
        select_query->setExpression(ASTSelectQuery::Expression::WITH, getWith().toAST());
@ -283,7 +290,7 @@ ASTPtr QueryNode::toASTImpl() const
    if (getWhere())
        select_query->setExpression(ASTSelectQuery::Expression::WHERE, getWhere()->toAST());

-    if (hasGroupBy())
+    if (!is_group_by_all && hasGroupBy())
        select_query->setExpression(ASTSelectQuery::Expression::GROUP_BY, getGroupBy().toAST());

    if (hasHaving())
--- a/src/Analyzer/QueryNode.h
+++ b/src/Analyzer/QueryNode.h
@ -176,6 +176,18 @@ public:
        is_group_by_with_grouping_sets = is_group_by_with_grouping_sets_value;
    }

+    /// Returns true if query node has GROUP BY ALL modifier, false otherwise.
+    /// When it is set, QueryAnalyzer::resolveQuery fills GROUP BY from the projection (expandGroupByAll),
+    /// and dumpTreeImpl / toASTImpl do not output the GROUP BY section.
+    bool isGroupByAll() const
+    {
+        return is_group_by_all;
+    }
+
+    /// Set query node GROUP BY ALL modifier value.
+    /// QueryTreeBuilder::buildSelectExpression sets it from the `group_by_all` field of the SELECT query AST.
+    void setIsGroupByAll(bool is_group_by_all_value)
+    {
+        is_group_by_all = is_group_by_all_value;
+    }
+
    /// Returns true if query node WITH section is not empty, false otherwise
    bool hasWith() const
    {
@ -580,6 +592,7 @@ private:
    bool is_group_by_with_rollup = false;
    bool is_group_by_with_cube = false;
    bool is_group_by_with_grouping_sets = false;
+    bool is_group_by_all = false;

    std::string cte_name;
    NamesAndTypes projection_columns;
--- a/src/Analyzer/QueryTreeBuilder.cpp
+++ b/src/Analyzer/QueryTreeBuilder.cpp
@ -215,6 +215,7 @@ QueryTreeNodePtr QueryTreeBuilder::buildSelectExpression(const ASTPtr & select_q
    current_query_tree->setIsGroupByWithCube(select_query_typed.group_by_with_cube);
    current_query_tree->setIsGroupByWithRollup(select_query_typed.group_by_with_rollup);
    current_query_tree->setIsGroupByWithGroupingSets(select_query_typed.group_by_with_grouping_sets);
+    current_query_tree->setIsGroupByAll(select_query_typed.group_by_all);
    current_query_tree->setOriginalAST(select_query);

    auto select_settings = select_query_typed.settings();
--- a/src/Backups/registerBackupEngineS3.cpp
+++ b/src/Backups/registerBackupEngineS3.cpp
@ -110,12 +110,12 @@ void registerBackupEngineS3(BackupFactory & factory)

        if (params.open_mode == IBackup::OpenMode::READ)
        {
-            auto reader = std::make_shared<BackupReaderS3>(S3::URI{Poco::URI{s3_uri}}, access_key_id, secret_access_key, params.context);
+            auto reader = std::make_shared<BackupReaderS3>(S3::URI{s3_uri}, access_key_id, secret_access_key, params.context);
            return std::make_unique<BackupImpl>(backup_name_for_logging, archive_params, params.base_backup_info, reader, params.context);
        }
        else
        {
-            auto writer = std::make_shared<BackupWriterS3>(S3::URI{Poco::URI{s3_uri}}, access_key_id, secret_access_key, params.context);
+            auto writer = std::make_shared<BackupWriterS3>(S3::URI{s3_uri}, access_key_id, secret_access_key, params.context);
            return std::make_unique<BackupImpl>(backup_name_for_logging, archive_params, params.base_backup_info, writer, params.context, params.is_internal_backup, params.backup_coordination, params.backup_uuid);
        }
 #else
--- a/src/Client/ClientBase.cpp
+++ b/src/Client/ClientBase.cpp
@ -1401,6 +1401,11 @@ try
    QueryPipeline pipeline(std::move(pipe));
    PullingAsyncPipelineExecutor executor(pipeline);

+    if (need_render_progress)
+    {
+        pipeline.setProgressCallback([this](const Progress & progress){ onProgress(progress); });
+    }
+
    Block block;
    while (executor.pull(block))
    {
@ -1445,12 +1450,6 @@ catch (...)

 void ClientBase::sendDataFromStdin(Block & sample, const ColumnsDescription & columns_description, ASTPtr parsed_query)
 {
-    if (need_render_progress)
-    {
-        /// Add callback to track reading from fd.
-        std_in.setProgressCallback(global_context);
-    }
-
    /// Send data read from stdin.
    try
    {
--- a/src/Client/ClientBase.h
+++ b/src/Client/ClientBase.h
@ -171,6 +171,11 @@ protected:

    void initTtyBuffer(ProgressOption progress);

+    /// Should be one of the first, to be destroyed the last,
+    /// since other members can use them.
+    SharedContextHolder shared_context;
+    ContextMutablePtr global_context;
+
    bool is_interactive = false; /// Use either interactive line editing interface or batch mode.
    bool is_multiquery = false;
    bool delayed_interactive = false;
@ -208,9 +213,6 @@ protected:
    /// Settings specified via command line args
    Settings cmd_settings;

-    SharedContextHolder shared_context;
-    ContextMutablePtr global_context;
-
    /// thread status should be destructed before shared context because it relies on process list.
    std::optional<ThreadStatus> thread_status;

--- a/src/Common/CurrentMetrics.cpp
+++ b/src/Common/CurrentMetrics.cpp
@ -5,6 +5,7 @@
 #define APPLY_FOR_METRICS(M) \
    M(Query, "Number of executing queries") \
    M(Merge, "Number of executing background merges") \
+    M(Move, "Number of currently executing moves") \
    M(PartMutation, "Number of mutations (ALTER DELETE/UPDATE)") \
    M(ReplicatedFetch, "Number of data parts being fetched from replica") \
    M(ReplicatedSend, "Number of data parts being sent to replicas") \
--- a/src/Common/EventRateMeter.h
+++ b/src/Common/EventRateMeter.h
@ -27,6 +27,14 @@ public:
    /// NOTE: Adding events into distant past (further than `period`) must be avoided.
    void add(double now, double count)
    {
+        // Remove data for the initial heating stage that can be present at the beginning of a query.
+        // Otherwise it leads to a wrong gradual increase of the average value, making the algorithm not very reactive.
+        if (count != 0.0 && ++data_points < 5)
+        {
+            start = events.time;
+            events = ExponentiallySmoothedAverage();
+        }
+
        if (now - period <= start) // precise counting mode
            events = ExponentiallySmoothedAverage(events.value + count, now);
        else // exponential smoothing mode
@ -51,6 +59,7 @@ public:
    {
        start = now;
        events = ExponentiallySmoothedAverage();
+        data_points = 0;
    }

 private:
@ -58,6 +67,7 @@ private:
    const double half_decay_time;
    double start; // Instant in past without events before it; when measurement started or reset
    ExponentiallySmoothedAverage events; // Estimated number of events in the last `period`
+    size_t data_points = 0;
 };

 }
--- a/src/Common/Exception.cpp
+++ b/src/Common/Exception.cpp
@ -15,6 +15,7 @@
 #include <Common/formatReadable.h>
 #include <Common/filesystemHelpers.h>
 #include <Common/ErrorCodes.h>
+#include <Common/SensitiveDataMasker.h>
 #include <Common/LockMemoryExceptionInThread.h>
 #include <filesystem>

@ -63,11 +64,18 @@ void handle_error_code([[maybe_unused]] const std::string & msg, int code, bool
    ErrorCodes::increment(code, remote, msg, trace);
 }

-Exception::Exception(const std::string & msg, int code, bool remote_)
-    : Poco::Exception(msg, code)
+/// Stores a copy of the message and, if a SensitiveDataMasker instance is available,
+/// passes that copy through its wipeSensitiveData.
+Exception::MessageMasked::MessageMasked(const std::string & msg_)
+    : msg(msg_)
+{
+    if (auto * masker = SensitiveDataMasker::getInstance())
+        masker->wipeSensitiveData(msg);
+}
+
+/// Constructs the exception from an already masked message:
+/// the same masked text is given both to Poco::Exception and to handle_error_code.
+Exception::Exception(const MessageMasked & msg_masked, int code, bool remote_)
+    : Poco::Exception(msg_masked.msg, code)
     , remote(remote_)
 {
-    handle_error_code(msg, code, remote, getStackFramePointers());
+    handle_error_code(msg_masked.msg, code, remote, getStackFramePointers());
 }

 Exception::Exception(CreateFromPocoTag, const Poco::Exception & exc)
--- a/src/Common/Exception.h
+++ b/src/Common/Exception.h
@ -27,7 +27,19 @@ public:
    using FramePointers = std::vector<void *>;

    Exception() = default;
-    Exception(const std::string & msg, int code, bool remote_ = false);
+
+    /// Used to remove the sensitive information from exceptions if query_masking_rules is configured.
+    /// Holds the exception message after it has been processed by the constructor
+    /// (the constructor is defined in Exception.cpp and applies SensitiveDataMasker when an instance exists).
+    struct MessageMasked
+    {
+        std::string msg;
+        MessageMasked(const std::string & msg_);
+    };
+
+    Exception(const MessageMasked & msg_masked, int code, bool remote_);
+
+    // delegating constructor to mask sensitive information from the message
+    Exception(const std::string & msg, int code, bool remote_ = false): Exception(MessageMasked(msg), code, remote_)
+    {}

    Exception(int code, const std::string & message)
        : Exception(message, code)
@ -54,12 +66,17 @@ public:
    template <typename... Args>
    void addMessage(fmt::format_string<Args...> format, Args &&... args)
    {
-        extendedMessage(fmt::format(format, std::forward<Args>(args)...));
+        addMessage(fmt::format(format, std::forward<Args>(args)...));
    }

    void addMessage(const std::string& message)
    {
-        extendedMessage(message);
+        addMessage(MessageMasked(message));
+    }
+
+    void addMessage(const MessageMasked & msg_masked)
+    {
+        extendedMessage(msg_masked.msg);
    }

    /// Used to distinguish local exceptions from the one that was received from remote node.
--- a/src/Common/MemoryTracker.cpp
+++ b/src/Common/MemoryTracker.cpp
@ -220,7 +220,7 @@ void MemoryTracker::allocImpl(Int64 size, bool throw_if_memory_exceeded, MemoryT
    Int64 limit_to_check = current_hard_limit;

 #if USE_JEMALLOC
-    if (level == VariableContext::Global)
+    if (level == VariableContext::Global && allow_use_jemalloc_memory.load(std::memory_order_relaxed))
    {
        /// Jemalloc arenas may keep some extra memory.
        /// This memory was substucted from RSS to decrease memory drift.
--- a/src/Common/MemoryTracker.h
+++ b/src/Common/MemoryTracker.h
@ -55,6 +55,7 @@ private:
    std::atomic<Int64> soft_limit {0};
    std::atomic<Int64> hard_limit {0};
    std::atomic<Int64> profiler_limit {0};
+    std::atomic_bool allow_use_jemalloc_memory {true};

    static std::atomic<Int64> free_memory_in_allocator_arenas;

@ -125,6 +126,10 @@ public:
    {
        return soft_limit.load(std::memory_order_relaxed);
    }
+    /// Controls whether the jemalloc-specific branch of allocImpl
+    /// (taken only at VariableContext::Global level) is enabled. The flag is stored with relaxed ordering.
+    void setAllowUseJemallocMemory(bool value)
+    {
+        allow_use_jemalloc_memory.store(value, std::memory_order_relaxed);
+    }

    /** Set limit if it was not set.
      * Otherwise, set limit to new value, if new value is greater than previous limit.
--- a/src/Common/ProgressIndication.h
+++ b/src/Common/ProgressIndication.h
@ -90,7 +90,7 @@ private:

    bool write_progress_on_update = false;

-    EventRateMeter cpu_usage_meter{static_cast<double>(clock_gettime_ns()), 3'000'000'000 /*ns*/}; // average cpu utilization last 3 second
+    EventRateMeter cpu_usage_meter{static_cast<double>(clock_gettime_ns()), 2'000'000'000 /*ns*/}; // average cpu utilization last 2 second
    HostToThreadTimesMap thread_data;
    /// In case of all of the above:
    /// - clickhouse-local
--- a/src/Common/SipHash.h
+++ b/src/Common/SipHash.h
@ -189,6 +189,13 @@ public:
        finalize();
        return v0 ^ v1 ^ v2 ^ v3;
    }
+
+    /// Convenience overload: returns the 128-bit hash by value
+    /// by calling get128(UInt128 &) on a local variable.
+    UInt128 get128()
+    {
+        UInt128 res;
+        get128(res);
+        return res;
+    }
 };


@ -208,9 +215,7 @@ inline UInt128 sipHash128(const char * data, const size_t size)
 {
    SipHash hash;
    hash.update(data, size);
-    UInt128 res;
-    hash.get128(res);
-    return res;
+    return hash.get128();
 }

 inline UInt64 sipHash64(const char * data, const size_t size)
--- a/src/Common/TaskStatsInfoGetter.cpp
+++ b/src/Common/TaskStatsInfoGetter.cpp
@ -8,6 +8,7 @@

 #include "hasLinuxCapability.h"
 #include <base/unaligned.h>
+#include <Common/logger_useful.h>

 #include <cerrno>
 #include <cstdio>
@ -205,6 +206,20 @@ bool checkPermissionsImpl()
    {
        TaskStatsInfoGetter();
    }
+    catch (const Exception & e)
+    {
+        if (e.code() == ErrorCodes::NETLINK_ERROR)
+        {
+            /// This error happens all the time when running inside Docker - consider it ok,
+            /// don't create noise with this error.
+            LOG_DEBUG(&Poco::Logger::get(__PRETTY_FUNCTION__), "{}", getCurrentExceptionMessage(false));
+        }
+        else
+        {
+            tryLogCurrentException(__PRETTY_FUNCTION__);
+        }
+        return false;
+    }
    catch (...)
    {
        tryLogCurrentException(__PRETTY_FUNCTION__);
--- a/src/Common/ThreadProfileEvents.h
+++ b/src/Common/ThreadProfileEvents.h
@ -1,6 +1,7 @@
 #pragma once

 #include <base/types.h>
+#include <base/getThreadId.h>
 #include <Common/ProfileEvents.h>
 #include <sys/time.h>
 #include <sys/resource.h>
@ -47,6 +48,8 @@ struct RUsageCounters
    UInt64 soft_page_faults = 0;
    UInt64 hard_page_faults = 0;

+    UInt64 thread_id = 0;
+
    RUsageCounters() = default;
    RUsageCounters(const ::rusage & rusage_, UInt64 real_time_)
    {
@ -61,6 +64,8 @@ struct RUsageCounters

        soft_page_faults = static_cast<UInt64>(rusage.ru_minflt);
        hard_page_faults = static_cast<UInt64>(rusage.ru_majflt);
+
+        thread_id = getThreadId();
    }

    static RUsageCounters current()
@ -78,6 +83,12 @@ struct RUsageCounters

    static void incrementProfileEvents(const RUsageCounters & prev, const RUsageCounters & curr, ProfileEvents::Counters & profile_events)
    {
+        chassert(prev.thread_id == curr.thread_id);
+        /// LONG_MAX is ~106751 days
+        chassert(curr.real_time - prev.real_time < LONG_MAX);
+        chassert(curr.user_time - prev.user_time < LONG_MAX);
+        chassert(curr.sys_time - prev.sys_time < LONG_MAX);
+
        profile_events.increment(ProfileEvents::RealTimeMicroseconds,   (curr.real_time - prev.real_time) / 1000U);
        profile_events.increment(ProfileEvents::UserTimeMicroseconds,   (curr.user_time - prev.user_time) / 1000U);
        profile_events.increment(ProfileEvents::SystemTimeMicroseconds, (curr.sys_time - prev.sys_time) / 1000U);
--- a/src/Common/ThreadStatus.h
+++ b/src/Common/ThreadStatus.h
@ -179,8 +179,8 @@ protected:
    /// Is used to send logs from logs_queue to client in case of fatal errors.
    std::function<void()> fatal_error_callback;

-    /// It is used to avoid enabling the query profiler when you have multiple ThreadStatus in the same thread
-    bool query_profiler_enabled = true;
+    /// See setInternalThread()
+    bool internal_thread = false;

    /// Requires access to query_id.
    friend class MemoryTrackerThreadSwitcher;
@ -225,11 +225,21 @@ public:
        return global_context.lock();
    }

-    void disableProfiling()
-    {
-        assert(!query_profiler_real && !query_profiler_cpu);
-        query_profiler_enabled = false;
-    }
+    /// "Internal" ThreadStatus is used for materialized views for separate
+    /// tracking into system.query_views_log
+    ///
+    /// You can have multiple internal threads, but only one non-internal with
+    /// the same thread_id.
+    ///
+    /// "Internal" thread:
+    /// - cannot have query profiler
+    ///   since the running (main query) thread should already have one
+    /// - should not try to obtain latest counter on detach
+    ///   because detaching of such threads will be done from a different
+    ///   thread_id, and some counters are not available (i.e. getrusage()),
+    ///   but anyway they are accounted correctly in the main ThreadStatus of a
+    ///   query.
+    void setInternalThread();

    /// Starts new query and create new thread group for it, current thread becomes master thread of the query
    void initializeQuery();
--- a/src/Coordination/KeeperSnapshotManagerS3.cpp
+++ b/src/Coordination/KeeperSnapshotManagerS3.cpp
@ -65,7 +65,7 @@ void KeeperSnapshotManagerS3::updateS3Configuration(const Poco::Util::AbstractCo
        auto auth_settings = S3::AuthSettings::loadFromConfig(config_prefix, config);

        auto endpoint = config.getString(config_prefix + ".endpoint");
-        auto new_uri = S3::URI{Poco::URI(endpoint)};
+        auto new_uri = S3::URI{endpoint};

        {
            std::lock_guard client_lock{snapshot_s3_client_mutex};
--- a/src/Core/Block.cpp
+++ b/src/Core/Block.cpp
@ -667,9 +667,15 @@ Names Block::getDataTypeNames() const
 }


-std::unordered_map<String, size_t> Block::getNamesToIndexesMap() const
+/// Builds a `column name -> position in the block` hash table from index_by_name.
+/// NOTE(review): NameMap is keyed by StringRef, so the keys presumably reference the strings
+/// owned by index_by_name and must not be used after this Block is changed or destroyed - confirm.
+Block::NameMap Block::getNamesToIndexesMap() const
 {
-    return index_by_name;
+    NameMap res;
+    res.reserve(index_by_name.size());
+
+    for (const auto & [name, index] : index_by_name)
+        res[name] = index;
+
+    return res;
 }


--- a/src/Core/Block.h
+++ b/src/Core/Block.h
@ -5,6 +5,8 @@
 #include <Core/ColumnsWithTypeAndName.h>
 #include <Core/NamesAndTypes.h>

+#include <Common/HashTable/HashMap.h>
+
 #include <initializer_list>
 #include <list>
 #include <map>
@ -93,7 +95,10 @@ public:
    Names getNames() const;
    DataTypes getDataTypes() const;
    Names getDataTypeNames() const;
-    std::unordered_map<String, size_t> getNamesToIndexesMap() const;
+
+    /// Hash table that maps `column name -> position in the block`.
+    using NameMap = HashMap<StringRef, size_t, StringRefHash>;
+    NameMap getNamesToIndexesMap() const;

    Serializations getSerializations() const;

--- a/src/Core/Settings.h
+++ b/src/Core/Settings.h
@ -851,6 +851,9 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
    M(Bool, output_format_sql_insert_include_column_names, true, "Include column names in INSERT query", 0) \
    M(Bool, output_format_sql_insert_use_replace, false, "Use REPLACE statement instead of INSERT", 0) \
    M(Bool, output_format_sql_insert_quote_names, true, "Quote column names with '`' characters", 0) \
+    \
+    M(Bool, output_format_bson_string_as_string, false, "Use BSON String type instead of Binary for String columns.", 0) \
+    M(Bool, input_format_bson_skip_fields_with_unsupported_types_in_schema_inference, false, "Skip fields with unsupported types while schema inference for format BSON.", 0) \

 // End of FORMAT_FACTORY_SETTINGS
 // Please add settings non-related to formats into the COMMON_SETTINGS above.
--- a/src/Dictionaries/DictionaryStructure.cpp
+++ b/src/Dictionaries/DictionaryStructure.cpp
@ -284,7 +284,7 @@ std::vector<DictionaryAttribute> DictionaryStructure::getAttributes(
    std::unordered_set<String> attribute_names;
    std::vector<DictionaryAttribute> res_attributes;

-    const FormatSettings format_settings;
+    const FormatSettings format_settings = {};

    for (const auto & config_elem : config_elems)
    {
--- a/src/Dictionaries/ExternalQueryBuilder.h
+++ b/src/Dictionaries/ExternalQueryBuilder.h
@ -62,7 +62,7 @@ struct ExternalQueryBuilder


 private:
-    const FormatSettings format_settings;
+    const FormatSettings format_settings = {};

    void composeLoadAllQuery(WriteBuffer & out) const;

--- a/src/Dictionaries/MongoDBDictionarySource.cpp
+++ b/src/Dictionaries/MongoDBDictionarySource.cpp
@ -74,7 +74,6 @@ void registerDictionarySourceMongoDB(DictionarySourceFactory & factory)
 // Poco/MongoDB/BSONWriter.h:54: void writeCString(const std::string & value);
 // src/IO/WriteHelpers.h:146 #define writeCString(s, buf)
 #include <IO/WriteHelpers.h>
-#include <Processors/Transforms/MongoDBSource.h>


 namespace DB
--- a/src/Dictionaries/MongoDBDictionarySource.h
+++ b/src/Dictionaries/MongoDBDictionarySource.h
@ -1,5 +1,6 @@
 #pragma once

+#include <Processors/Transforms/MongoDBSource.h>
 #include <Core/Block.h>

 #include "DictionaryStructure.h"
--- a/src/Disks/IO/createReadBufferFromFileBase.cpp
+++ b/src/Disks/IO/createReadBufferFromFileBase.cpp
@ -42,7 +42,7 @@ std::unique_ptr<ReadBufferFromFileBase> createReadBufferFromFileBase(
    if (read_hint.has_value())
        estimated_size = *read_hint;
    else if (file_size.has_value())
-        estimated_size = file_size.has_value() ? *file_size : 0;
+        estimated_size = *file_size;

    if (!existing_memory
        && settings.local_fs_method == LocalFSReadMethod::mmap
@ -158,7 +158,15 @@ std::unique_ptr<ReadBufferFromFileBase> createReadBufferFromFileBase(
 #endif

    ProfileEvents::increment(ProfileEvents::CreatedReadBufferOrdinary);
-    return create(settings.local_fs_buffer_size, flags);
+
+    size_t buffer_size = settings.local_fs_buffer_size;
+    /// Check if the buffer can be smaller than default
+    if (read_hint.has_value() && *read_hint > 0 && *read_hint < buffer_size)
+        buffer_size = *read_hint;
+    if (file_size.has_value() && *file_size < buffer_size)
+        buffer_size = *file_size;
+
+    return create(buffer_size, flags);
 }

 }
--- a/src/Disks/ObjectStorages/MetadataStorageFromDiskTransactionOperations.cpp
+++ b/src/Disks/ObjectStorages/MetadataStorageFromDiskTransactionOperations.cpp
@ -4,6 +4,7 @@
 #include <Common/getRandomASCIIString.h>
 #include <IO/WriteHelpers.h>
 #include <IO/ReadHelpers.h>
+#include <optional>
 #include <ranges>
 #include <filesystem>

@ -62,7 +63,7 @@ UnlinkFileOperation::UnlinkFileOperation(const std::string & path_, IDisk & disk

 void UnlinkFileOperation::execute(std::unique_lock<std::shared_mutex> &)
 {
-    auto buf = disk.readFile(path);
+    auto buf = disk.readFile(path, ReadSettings{}, std::nullopt, disk.getFileSize(path));
    readStringUntilEOF(prev_data, *buf);
    disk.removeFile(path);
 }
--- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp
+++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp
@ -658,7 +658,7 @@ std::unique_ptr<IObjectStorage> S3ObjectStorage::cloneObjectStorage(
    return std::make_unique<S3ObjectStorage>(
        std::move(new_client), std::move(new_s3_settings),
        version_id, s3_capabilities, new_namespace,
-        S3::URI(Poco::URI(config.getString(config_prefix + ".endpoint"))).endpoint);
+        config.getString(config_prefix + ".endpoint"));
 }

 }
--- a/src/Disks/ObjectStorages/S3/diskSettings.cpp
+++ b/src/Disks/ObjectStorages/S3/diskSettings.cpp
@ -137,7 +137,7 @@ std::unique_ptr<Aws::S3::S3Client> getClient(
        settings.request_settings.get_request_throttler,
        settings.request_settings.put_request_throttler);

-    S3::URI uri(Poco::URI(config.getString(config_prefix + ".endpoint")));
+    S3::URI uri(config.getString(config_prefix + ".endpoint"));
    if (uri.key.back() != '/')
        throw Exception("S3 path must ends with '/', but '" + uri.key + "' doesn't.", ErrorCodes::BAD_ARGUMENTS);

--- a/src/Disks/ObjectStorages/S3/registerDiskS3.cpp
+++ b/src/Disks/ObjectStorages/S3/registerDiskS3.cpp
@ -104,7 +104,7 @@ void registerDiskS3(DiskFactory & factory, bool global_skip_access_check)
        ContextPtr context,
        const DisksMap & /*map*/) -> DiskPtr
    {
-        S3::URI uri(Poco::URI(config.getString(config_prefix + ".endpoint")));
+        S3::URI uri(config.getString(config_prefix + ".endpoint"));

        if (uri.key.empty())
            throw Exception(ErrorCodes::BAD_ARGUMENTS, "No key in S3 uri: {}", uri.uri.toString());
--- a/src/Formats/BSONTypes.cpp
+++ b/src/Formats/BSONTypes.cpp
@ -0,0 +1,106 @@
+#include <Formats/BSONTypes.h>
+#include <Common/Exception.h>
+#include <Common/hex.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int UNKNOWN_TYPE;
+}
+
+static std::string byteToHexString(uint8_t byte)
+{
+    return "0x" + getHexUIntUppercase(byte); /// "0x" followed by whatever getHexUIntUppercase renders for the byte; used in the error messages below
+}
+
+BSONType getBSONType(uint8_t value)
+{
+    /// Accepted bytes: 0x01..0x13 plus the two special keys 0x7F (MAX_KEY) and 0xFF (MIN_KEY).
+    if (!((value >= 0x01 && value <= 0x13) || value == 0x7F || value == 0xFF))
+        throw Exception(ErrorCodes::UNKNOWN_TYPE, "Unknown BSON type: {}", byteToHexString(value));
+    return BSONType(value);
+}
+
+BSONBinarySubtype getBSONBinarySubtype(uint8_t value)
+{
+    /// Only the subtype bytes 0x00..0x07 are accepted; anything larger is rejected.
+    if (value > 0x07)
+        throw Exception(ErrorCodes::UNKNOWN_TYPE, "Unknown BSON binary subtype: {}", byteToHexString(value));
+    return BSONBinarySubtype(value);
+}
+
+std::string getBSONTypeName(BSONType type)
+{
+    switch (type) /// printable name per type; cases follow the declaration order of BSONType, no default label on purpose
+    {
+        case BSONType::DOUBLE:
+            return "Double";
+        case BSONType::STRING:
+            return "String";
+        case BSONType::DOCUMENT:
+            return "Document";
+        case BSONType::ARRAY:
+            return "Array";
+        case BSONType::BINARY:
+            return "Binary";
+        case BSONType::UNDEFINED:
+            return "Undefined";
+        case BSONType::OBJECT_ID:
+            return "ObjectId";
+        case BSONType::BOOL:
+            return "Bool";
+        case BSONType::DATETIME:
+            return "Datetime";
+        case BSONType::NULL_VALUE:
+            return "Null";
+        case BSONType::REGEXP:
+            return "Regexp";
+        case BSONType::DB_POINTER:
+            return "DBPointer";
+        case BSONType::JAVA_SCRIPT_CODE:
+            return "JavaScript code";
+        case BSONType::SYMBOL:
+            return "Symbol";
+        case BSONType::JAVA_SCRIPT_CODE_W_SCOPE:
+            return "JavaScript code w/ scope";
+        case BSONType::INT32:
+            return "Int32";
+        case BSONType::TIMESTAMP:
+            return "Timestamp";
+        case BSONType::INT64:
+            return "Int64";
+        case BSONType::DECIMAL128:
+            return "Decimal128";
+        case BSONType::MIN_KEY:
+            return "Min key";
+        case BSONType::MAX_KEY:
+            return "Max key";
+    }
+}
+
+std::string getBSONBinarySubtypeName(BSONBinarySubtype subtype)
+{
+    switch (subtype) /// printable name per subtype; one case per BSONBinarySubtype enumerator, no default label
+    {
+        case BSONBinarySubtype::BINARY:
+            return "Binary";
+        case BSONBinarySubtype::FUNCTION:
+            return "Function";
+        case BSONBinarySubtype::BINARY_OLD:
+            return "Binary (Old)";
+        case BSONBinarySubtype::UUID_OLD:
+            return "UUID (Old)";
+        case BSONBinarySubtype::UUID:
+            return "UUID";
+        case BSONBinarySubtype::MD5:
+            return "MD5";
+        case BSONBinarySubtype::ENCRYPTED_BSON_VALUE:
+            return "Encrypted BSON value";
+        case BSONBinarySubtype::COMPRESSED_BSON_COLUMN:
+            return "Compressed BSON column";
+    }
+}
+
+}
--- a/src/Formats/BSONTypes.h
+++ b/src/Formats/BSONTypes.h
@ -0,0 +1,64 @@
+#pragma once
+
+#include <cstdint>
+#include <limits>
+#include <string>
+
+namespace DB
+{
+
+/// Value of the byte that ends a BSON document (see the spec link below).
+static const uint8_t BSON_DOCUMENT_END = 0x00;
+/// Unsigned 32-bit type used here for BSON sizes; MAX_BSON_SIZE is its largest representable value.
+using BSONSizeT = uint32_t;
+static const BSONSizeT MAX_BSON_SIZE = std::numeric_limits<BSONSizeT>::max();
+
+/// Type bytes of BSON values.
+/// See details on https://bsonspec.org/spec.html
+enum class BSONType
+{
+    DOUBLE = 0x01,
+    STRING = 0x02,
+    DOCUMENT = 0x03,
+    ARRAY = 0x04,
+    BINARY = 0x05,
+    UNDEFINED = 0x06,
+    OBJECT_ID = 0x07,
+    BOOL = 0x08,
+    DATETIME = 0x09,
+    NULL_VALUE = 0x0A,
+    REGEXP = 0x0B,
+    DB_POINTER = 0x0C,
+    JAVA_SCRIPT_CODE = 0x0D,
+    SYMBOL = 0x0E,
+    JAVA_SCRIPT_CODE_W_SCOPE = 0x0F,
+    INT32 = 0x10,
+    TIMESTAMP = 0x11,
+    INT64 = 0x12,
+    DECIMAL128 = 0x13,
+    MIN_KEY = 0xFF,
+    MAX_KEY = 0x7F,
+};
+
+/// Subtype bytes of the BSON Binary type (0x00..0x07).
+enum class BSONBinarySubtype
+{
+    BINARY = 0x00,
+    FUNCTION = 0x01,
+    BINARY_OLD = 0x02,
+    UUID_OLD = 0x03,
+    UUID = 0x04,
+    MD5 = 0x05,
+    ENCRYPTED_BSON_VALUE = 0x06,
+    COMPRESSED_BSON_COLUMN = 0x07,
+};
+
+/// Convert a raw type byte to BSONType, and get a printable name for a BSONType.
+BSONType getBSONType(uint8_t value);
+std::string getBSONTypeName(BSONType type);
+
+/// The same pair of helpers for binary subtypes.
+BSONBinarySubtype getBSONBinarySubtype(uint8_t value);
+std::string getBSONBinarySubtypeName(BSONBinarySubtype subtype);
+
+}
--- a/src/Formats/ColumnMapping.cpp
+++ b/src/Formats/ColumnMapping.cpp
@ -18,7 +18,7 @@ void ColumnMapping::setupByHeader(const Block & header)
 }

 void ColumnMapping::addColumns(
-    const Names & column_names, const std::unordered_map<String, size_t> & column_indexes_by_names, const FormatSettings & settings)
+    const Names & column_names, const Block::NameMap & column_indexes_by_names, const FormatSettings & settings)
 {
    std::vector<bool> read_columns(column_indexes_by_names.size(), false);

@ -26,8 +26,8 @@ void ColumnMapping::addColumns(
    {
        names_of_columns.push_back(name);

-        const auto column_it = column_indexes_by_names.find(name);
-        if (column_it == column_indexes_by_names.end())
+        const auto * column_it = column_indexes_by_names.find(name);
+        if (!column_it)
        {
            if (settings.skip_unknown_fields)
            {
@ -41,7 +41,7 @@ void ColumnMapping::addColumns(
                name, column_indexes_for_input_fields.size());
        }

-        const auto column_index = column_it->second;
+        const auto column_index = column_it->getMapped();

        if (read_columns[column_index])
            throw Exception("Duplicate field found while parsing format header: " + name, ErrorCodes::INCORRECT_DATA);
--- a/src/Formats/ColumnMapping.h
+++ b/src/Formats/ColumnMapping.h
@ -28,7 +28,7 @@ struct ColumnMapping
    void setupByHeader(const Block & header);

    void addColumns(
-        const Names & column_names, const std::unordered_map<String, size_t> & column_indexes_by_names, const FormatSettings & settings);
+        const Names & column_names, const Block::NameMap & column_indexes_by_names, const FormatSettings & settings);

    void insertDefaultsForNotSeenColumns(MutableColumns & columns, std::vector<UInt8> & read_columns);
 };
--- a/src/Formats/EscapingRuleUtils.cpp
+++ b/src/Formats/EscapingRuleUtils.cpp
@ -834,17 +834,23 @@ DataTypes getDefaultDataTypeForEscapingRules(const std::vector<FormatSettings::E
    return data_types;
 }

+String getAdditionalFormatInfoForAllRowBasedFormats(const FormatSettings & settings)
+{
+    /// Schema-inference part of the format info; getAdditionalFormatInfoByEscapingRule appends its own settings after it.
+    const auto & hints = settings.schema_inference_hints;
+    const auto & max_rows = settings.max_rows_to_read_for_schema_inference;
+    return fmt::format("schema_inference_hints={}, max_rows_to_read_for_schema_inference={}", hints, max_rows);
+}
+
 String getAdditionalFormatInfoByEscapingRule(const FormatSettings & settings, FormatSettings::EscapingRule escaping_rule)
 {
-    String result;
+    String result = getAdditionalFormatInfoForAllRowBasedFormats(settings);
    /// First, settings that are common for all text formats:
-    result = fmt::format(
-        "schema_inference_hints={}, try_infer_integers={}, try_infer_dates={}, try_infer_datetimes={}, max_rows_to_read_for_schema_inference={}",
-        settings.schema_inference_hints,
+    result += fmt::format(
+        ", try_infer_integers={}, try_infer_dates={}, try_infer_datetimes={}",
        settings.try_infer_integers,
        settings.try_infer_dates,
-        settings.try_infer_datetimes,
-        settings.max_rows_to_read_for_schema_inference);
+        settings.try_infer_datetimes);

    /// Second, format-specific settings:
    switch (escaping_rule)
--- a/src/Formats/EscapingRuleUtils.h
+++ b/src/Formats/EscapingRuleUtils.h
@ -77,6 +77,7 @@ void transformInferredTypesIfNeeded(DataTypePtr & first, DataTypePtr & second, c
 void transformInferredJSONTypesIfNeeded(DataTypes & types, const FormatSettings & settings, const std::unordered_set<const IDataType *> * numbers_parsed_from_json_strings = nullptr);
 void transformInferredJSONTypesIfNeeded(DataTypePtr & first, DataTypePtr & second, const FormatSettings & settings);

+String getAdditionalFormatInfoForAllRowBasedFormats(const FormatSettings & settings);
 String getAdditionalFormatInfoByEscapingRule(const FormatSettings & settings, FormatSettings::EscapingRule escaping_rule);

 void checkSupportedDelimiterAfterField(FormatSettings::EscapingRule escaping_rule, const String & delimiter, const DataTypePtr & type);
--- a/src/Formats/FormatFactory.cpp
+++ b/src/Formats/FormatFactory.cpp
@ -178,6 +178,8 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
    format_settings.try_infer_integers = settings.input_format_try_infer_integers;
    format_settings.try_infer_dates = settings.input_format_try_infer_dates;
    format_settings.try_infer_datetimes = settings.input_format_try_infer_datetimes;
+    format_settings.bson.output_string_as_string = settings.output_format_bson_string_as_string;
+    format_settings.bson.skip_fields_with_unsupported_types_in_schema_inference = settings.input_format_bson_skip_fields_with_unsupported_types_in_schema_inference;

    /// Validate avro_schema_registry_url with RemoteHostFilter when non-empty and in Server context
    if (format_settings.schema.is_server)
--- a/src/Formats/FormatSettings.h
+++ b/src/Formats/FormatSettings.h
@ -303,6 +303,12 @@ struct FormatSettings
        bool use_replace = false;
        bool quote_names = true;
    } sql_insert;
+
+    struct
+    {
+        bool output_string_as_string = false; /// default matches setting output_format_bson_string_as_string
+        bool skip_fields_with_unsupported_types_in_schema_inference = false; /// default matches the corresponding input_format_bson_* setting
+    } bson;
 };

 }
--- a/src/Formats/JSONUtils.cpp
+++ b/src/Formats/JSONUtils.cpp
@ -231,7 +231,14 @@ namespace JSONUtils
            {
                auto type = getDataTypeFromFieldImpl(key_value_pair.second, settings, numbers_parsed_from_json_strings);
                if (!type)
+                {
+                    /// If we couldn't infer nested type and Object type is not enabled,
+                    /// we can't determine the type of this JSON field.
+                    if (!settings.json.try_infer_objects)
+                        return nullptr;
+
                    continue;
+                }

                if (settings.json.try_infer_objects && isObject(type))
                    return std::make_shared<DataTypeObject>("json", true);
--- a/src/Formats/registerFormats.cpp
+++ b/src/Formats/registerFormats.cpp
@ -19,6 +19,7 @@ void registerFileSegmentationEngineJSONCompactEachRow(FormatFactory & factory);
 void registerFileSegmentationEngineHiveText(FormatFactory & factory);
 #endif
 void registerFileSegmentationEngineLineAsString(FormatFactory & factory);
+void registerFileSegmentationEngineBSONEachRow(FormatFactory & factory);

 /// Formats for both input/output.

@ -49,6 +50,8 @@ void registerInputFormatJSONColumns(FormatFactory & factory);
 void registerOutputFormatJSONColumns(FormatFactory & factory);
 void registerInputFormatJSONCompactColumns(FormatFactory & factory);
 void registerOutputFormatJSONCompactColumns(FormatFactory & factory);
+void registerInputFormatBSONEachRow(FormatFactory & factory);
+void registerOutputFormatBSONEachRow(FormatFactory & factory);
 void registerInputFormatJSONColumnsWithMetadata(FormatFactory & factory);
 void registerOutputFormatJSONColumnsWithMetadata(FormatFactory & factory);
 void registerInputFormatProtobuf(FormatFactory & factory);
@ -136,7 +139,7 @@ void registerTSKVSchemaReader(FormatFactory & factory);
 void registerValuesSchemaReader(FormatFactory & factory);
 void registerTemplateSchemaReader(FormatFactory & factory);
 void registerMySQLSchemaReader(FormatFactory & factory);
-
+void registerBSONEachRowSchemaReader(FormatFactory & factory);

 void registerFileExtensions(FormatFactory & factory);

@ -155,6 +158,7 @@ void registerFormats()
    registerFileSegmentationEngineHiveText(factory);
 #endif
    registerFileSegmentationEngineLineAsString(factory);
+    registerFileSegmentationEngineBSONEachRow(factory);


    registerInputFormatNative(factory);
@ -184,6 +188,8 @@ void registerFormats()
    registerOutputFormatJSONColumns(factory);
    registerInputFormatJSONCompactColumns(factory);
    registerOutputFormatJSONCompactColumns(factory);
+    registerInputFormatBSONEachRow(factory);
+    registerOutputFormatBSONEachRow(factory);
    registerInputFormatJSONColumnsWithMetadata(factory);
    registerOutputFormatJSONColumnsWithMetadata(factory);
    registerInputFormatProtobuf(factory);
@ -267,6 +273,7 @@ void registerFormats()
    registerValuesSchemaReader(factory);
    registerTemplateSchemaReader(factory);
    registerMySQLSchemaReader(factory);
+    registerBSONEachRowSchemaReader(factory);
 }

 }
--- a/src/Functions/filesystem.cpp
+++ b/src/Functions/filesystem.cpp
@ -1,31 +1,40 @@
-#include <Functions/IFunction.h>
-#include <Functions/FunctionFactory.h>
+#include <Columns/ColumnString.h>
+#include <Columns/ColumnVector.h>
 #include <DataTypes/DataTypesNumber.h>
+#include <Disks/IDisk.h>
+#include <Functions/FunctionFactory.h>
+#include <Functions/IFunction.h>
 #include <Interpreters/Context.h>
-#include <filesystem>
 #include <Poco/Util/AbstractConfiguration.h>

 namespace DB
 {
+namespace ErrorCodes
+{
+    extern const int ILLEGAL_COLUMN;
+    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
+    extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
+    extern const int UNKNOWN_DISK;
+}
 namespace
 {

 struct FilesystemAvailable
 {
    static constexpr auto name = "filesystemAvailable";
-    static std::uintmax_t get(const std::filesystem::space_info & spaceinfo) { return spaceinfo.available; }
+    static std::uintmax_t get(const DiskPtr & disk) { return disk->getAvailableSpace(); }
 };

-struct FilesystemFree
+struct FilesystemUnreserved
 {
-    static constexpr auto name = "filesystemFree";
-    static std::uintmax_t get(const std::filesystem::space_info & spaceinfo) { return spaceinfo.free; }
+    static constexpr auto name = "filesystemUnreserved";
+    static std::uintmax_t get(const DiskPtr & disk) { return disk->getUnreservedSpace(); }
 };

 struct FilesystemCapacity
 {
    static constexpr auto name = "filesystemCapacity";
-    static std::uintmax_t get(const std::filesystem::space_info & spaceinfo) { return spaceinfo.capacity; }
+    static std::uintmax_t get(const DiskPtr & disk) { return disk->getTotalSpace(); }
 };

 template <typename Impl>
@ -34,34 +43,72 @@ class FilesystemImpl : public IFunction
 public:
    static constexpr auto name = Impl::name;

-    static FunctionPtr create(ContextPtr context)
-    {
-        return std::make_shared<FilesystemImpl<Impl>>(std::filesystem::space(context->getPath()));
-    }
+    static FunctionPtr create(ContextPtr context_) { return std::make_shared<FilesystemImpl<Impl>>(context_); }
+
+    explicit FilesystemImpl(ContextPtr context_) : context(context_) { }
+
+    bool useDefaultImplementationForConstants() const override { return true; }

    bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override
    {
        return false;
    }

-    explicit FilesystemImpl(std::filesystem::space_info spaceinfo_) : spaceinfo(spaceinfo_) { }
-
    String getName() const override { return name; }
+
+    bool isVariadic() const override { return true; }
+
    size_t getNumberOfArguments() const override { return 0; }
    bool isDeterministic() const override { return false; }

-    DataTypePtr getReturnTypeImpl(const DataTypes & /*arguments*/) const override
+    DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
    {
+        if (arguments.size() > 1)
+        {
+            throw Exception("Arguments size of function " + getName() + " should be 0 or 1", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
+        }
+        if (arguments.size() == 1 && !isStringOrFixedString(arguments[0]))
+        {
+            throw Exception(
+                "Arguments of function " + getName() + " should be String or FixedString", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+        }
        return std::make_shared<DataTypeUInt64>();
    }

-    ColumnPtr executeImpl(const ColumnsWithTypeAndName &, const DataTypePtr &, size_t input_rows_count) const override
+    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
    {
-        return DataTypeUInt64().createColumnConst(input_rows_count, static_cast<UInt64>(Impl::get(spaceinfo)));
+        if (arguments.empty())
+        {
+            auto disk = context->getDisk("default");
+            return DataTypeUInt64().createColumnConst(input_rows_count, Impl::get(disk));
+        }
+        else
+        {
+            auto col = arguments[0].column;
+            if (const ColumnString * col_str = checkAndGetColumn<ColumnString>(col.get()))
+            {
+                auto disk_map = context->getDisksMap();
+
+                auto col_res = ColumnVector<UInt64>::create(col_str->size());
+                auto & data = col_res->getData();
+                for (size_t i = 0; i < col_str->size(); ++i)
+                {
+                    auto disk_name = col_str->getDataAt(i).toString();
+                    if (auto it = disk_map.find(disk_name); it != disk_map.end())
+                        data[i] = Impl::get(it->second);
+                    else
+                        throw Exception(
+                            "Unknown disk name " + disk_name + " while execute function " + getName(), ErrorCodes::UNKNOWN_DISK);
+                }
+                return col_res;
+            }
+            throw Exception(
+                "Illegal column " + arguments[0].column->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_COLUMN);
+        }
    }

 private:
-    std::filesystem::space_info spaceinfo;
+    ContextPtr context;
 };

 }
@ -70,7 +117,7 @@ REGISTER_FUNCTION(Filesystem)
 {
    factory.registerFunction<FilesystemImpl<FilesystemAvailable>>();
    factory.registerFunction<FilesystemImpl<FilesystemCapacity>>();
-    factory.registerFunction<FilesystemImpl<FilesystemFree>>();
+    factory.registerFunction<FilesystemImpl<FilesystemUnreserved>>();
 }

 }
--- a/src/Functions/if.cpp
+++ b/src/Functions/if.cpp
@ -1016,6 +1016,7 @@ public:
    size_t getNumberOfArguments() const override { return 3; }

    bool useDefaultImplementationForNulls() const override { return false; }
+    bool useDefaultImplementationForNothing() const override { return false; }
    bool isShortCircuit(ShortCircuitSettings & settings, size_t /*number_of_arguments*/) const override
    {
        settings.enable_lazy_execution_for_first_argument = false;
--- a/src/Functions/multiIf.cpp
+++ b/src/Functions/multiIf.cpp
@ -50,6 +50,7 @@ public:
    bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; }
    size_t getNumberOfArguments() const override { return 0; }
    bool useDefaultImplementationForNulls() const override { return false; }
+    bool useDefaultImplementationForNothing() const override { return false; }

    ColumnNumbers getArgumentsThatDontImplyNullableReturnType(size_t number_of_arguments) const override
    {
--- a/src/Functions/runningConcurrency.cpp
+++ b/src/Functions/runningConcurrency.cpp
@ -57,7 +57,7 @@ namespace DB

                if (unlikely(begin > end))
                {
-                    const FormatSettings default_format;
+                    const FormatSettings default_format{};
                    WriteBufferFromOwnString buf_begin, buf_end;
                    begin_serializaion->serializeTextQuoted(*(arguments[0].column), i, buf_begin, default_format);
                    end_serialization->serializeTextQuoted(*(arguments[1].column), i, buf_end, default_format);
--- a/src/IO/ReadHelpers.cpp
+++ b/src/IO/ReadHelpers.cpp
@ -1278,6 +1278,25 @@ void skipToUnescapedNextLineOrEOF(ReadBuffer & buf)
    }
 }

+void skipNullTerminated(ReadBuffer & buf)
+{
+    while (!buf.eof()) /// skip everything up to and including the next '\0'; the loop ends at end of stream if there is none
+    {
+        char * next_pos = find_first_symbols<'\0'>(buf.position(), buf.buffer().end());
+        buf.position() = next_pos;
+        /// No pending data after the move: presumably no '\0' was found in this buffer, so go back to the eof() check (TODO confirm helper semantics).
+        if (!buf.hasPendingData())
+            continue;
+        /// Otherwise the position is expected to be at the terminator: step over it and finish.
+        if (*buf.position() == '\0')
+        {
+            ++buf.position();
+            return;
+        }
+    }
+}
+
+
 void saveUpToPosition(ReadBuffer & in, Memory<> & memory, char * current)
 {
    assert(current >= in.position());
--- a/src/IO/ReadHelpers.h
+++ b/src/IO/ReadHelpers.h
@ -1448,6 +1448,8 @@ void skipToCarriageReturnOrEOF(ReadBuffer & buf);
 /// Skip to next character after next unescaped \n. If no \n in stream, skip to end. Does not throw on invalid escape sequences.
 void skipToUnescapedNextLineOrEOF(ReadBuffer & buf);

+/// Skip to next character after next \0. If no \0 in stream, skip to end.
+void skipNullTerminated(ReadBuffer & buf);

 /** This function just copies the data from buffer's internal position (in.position())
  * to current position (from arguments) into memory.
--- a/src/IO/S3/tests/gtest_aws_s3_client.cpp
+++ b/src/IO/S3/tests/gtest_aws_s3_client.cpp
@ -76,7 +76,7 @@ TEST(IOTestAwsS3Client, AppendExtraSSECHeaders)

    DB::RemoteHostFilter remote_host_filter;
    unsigned int s3_max_redirects = 100;
-    DB::S3::URI uri(Poco::URI(http.getUrl() + "/IOTestAwsS3ClientAppendExtraHeaders/test.txt"));
+    DB::S3::URI uri(http.getUrl() + "/IOTestAwsS3ClientAppendExtraHeaders/test.txt");
    String access_key_id = "ACCESS_KEY_ID";
    String secret_access_key = "SECRET_ACCESS_KEY";
    String region = "us-east-1";
--- a/src/IO/S3Common.cpp
+++ b/src/IO/S3Common.cpp
@ -759,7 +759,7 @@ namespace S3
            put_request_throttler);
    }

-    URI::URI(const Poco::URI & uri_)
+    URI::URI(const std::string & uri_)
    {
        /// Case when bucket name represented in domain name of S3 URL.
        /// E.g. (https://bucket-name.s3.Region.amazonaws.com/key)
@ -777,16 +777,32 @@ namespace S3
        static constexpr auto OBS = "OBS";
        static constexpr auto OSS = "OSS";

-        uri = uri_;
+        uri = Poco::URI(uri_);
+
        storage_name = S3;

        if (uri.getHost().empty())
            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Host is empty in S3 URI.");

        /// Extract object version ID from query string.
+        bool has_version_id = false;
        for (const auto & [query_key, query_value] : uri.getQueryParameters())
            if (query_key == "versionId")
+            {
                version_id = query_value;
+                has_version_id = true;
+            }
+
+        /// Poco::URI will ignore '?' when parsing the path, but if there is a versionId in the http parameter,
+        /// '?' can not be used as a wildcard, otherwise it will be ambiguous.
+        /// If there is no "versionId" in the http parameter, '?' can be used as a wildcard.
+        /// It is necessary to encode '?' to avoid deletion while parsing the path.
+        if (!has_version_id && uri_.find('?') != String::npos)
+        {
+            String uri_with_question_mark_encode;
+            Poco::URI::encode(uri_, "?", uri_with_question_mark_encode);
+            uri = Poco::URI(uri_with_question_mark_encode);
+        }

        String name;
        String endpoint_authority_from_uri;
--- a/src/IO/S3Common.h
+++ b/src/IO/S3Common.h
@ -119,8 +119,7 @@ struct URI

    bool is_virtual_hosted_style;

-    explicit URI(const Poco::URI & uri_);
-    explicit URI(const std::string & uri_) : URI(Poco::URI(uri_)) {}
+    explicit URI(const std::string & uri_);

    static void validateBucket(const String & bucket, const Poco::URI & uri);
 };
--- a/src/IO/tests/gtest_s3_uri.cpp
+++ b/src/IO/tests/gtest_s3_uri.cpp
@ -20,55 +20,55 @@ struct TestCase
 };

 const TestCase TestCases[] = {
-    {S3::URI(Poco::URI("https://bucketname.s3.us-east-2.amazonaws.com/data")),
+    {S3::URI("https://bucketname.s3.us-east-2.amazonaws.com/data"),
     "https://s3.us-east-2.amazonaws.com",
     "bucketname",
     "data",
     "",
     true},
-    {S3::URI(Poco::URI("https://bucketname.s3.us-east-2.amazonaws.com/data?firstKey=someKey&secondKey=anotherKey")),
+    {S3::URI("https://bucketname.s3.us-east-2.amazonaws.com/data?firstKey=someKey&secondKey=anotherKey"),
+     "https://s3.us-east-2.amazonaws.com",
+     "bucketname",
+     "data?firstKey=someKey&secondKey=anotherKey",
+     "",
+     true},
+    {S3::URI("https://bucketname.s3.us-east-2.amazonaws.com/data?versionId=testVersionId&anotherKey=someOtherKey"),
+     "https://s3.us-east-2.amazonaws.com",
+     "bucketname",
+     "data",
+     "testVersionId",
+     true},
+    {S3::URI("https://bucketname.s3.us-east-2.amazonaws.com/data?firstKey=someKey&versionId=testVersionId&anotherKey=someOtherKey"),
+     "https://s3.us-east-2.amazonaws.com",
+     "bucketname",
+     "data",
+     "testVersionId",
+     true},
+    {S3::URI("https://bucketname.s3.us-east-2.amazonaws.com/data?anotherKey=someOtherKey&versionId=testVersionId"),
+     "https://s3.us-east-2.amazonaws.com",
+     "bucketname",
+     "data",
+     "testVersionId",
+     true},
+    {S3::URI("https://bucketname.s3.us-east-2.amazonaws.com/data?versionId=testVersionId"),
+     "https://s3.us-east-2.amazonaws.com",
+     "bucketname",
+     "data",
+     "testVersionId",
+     true},
+    {S3::URI("https://bucketname.s3.us-east-2.amazonaws.com/data?versionId="),
     "https://s3.us-east-2.amazonaws.com",
     "bucketname",
     "data",
     "",
     true},
-    {S3::URI(Poco::URI("https://bucketname.s3.us-east-2.amazonaws.com/data?versionId=testVersionId&anotherKey=someOtherKey")),
-     "https://s3.us-east-2.amazonaws.com",
-     "bucketname",
-     "data",
-     "testVersionId",
-     true},
-    {S3::URI(Poco::URI("https://bucketname.s3.us-east-2.amazonaws.com/data?firstKey=someKey&versionId=testVersionId&anotherKey=someOtherKey")),
-     "https://s3.us-east-2.amazonaws.com",
-     "bucketname",
-     "data",
-     "testVersionId",
-     true},
-    {S3::URI(Poco::URI("https://bucketname.s3.us-east-2.amazonaws.com/data?anotherKey=someOtherKey&versionId=testVersionId")),
-     "https://s3.us-east-2.amazonaws.com",
-     "bucketname",
-     "data",
-     "testVersionId",
-     true},
-    {S3::URI(Poco::URI("https://bucketname.s3.us-east-2.amazonaws.com/data?versionId=testVersionId")),
-     "https://s3.us-east-2.amazonaws.com",
-     "bucketname",
-     "data",
-     "testVersionId",
-     true},
-    {S3::URI(Poco::URI("https://bucketname.s3.us-east-2.amazonaws.com/data?versionId=")),
+    {S3::URI("https://bucketname.s3.us-east-2.amazonaws.com/data?versionId&"),
     "https://s3.us-east-2.amazonaws.com",
     "bucketname",
     "data",
     "",
     true},
-    {S3::URI(Poco::URI("https://bucketname.s3.us-east-2.amazonaws.com/data?versionId&")),
-     "https://s3.us-east-2.amazonaws.com",
-     "bucketname",
-     "data",
-     "",
-     true},
-    {S3::URI(Poco::URI("https://bucketname.s3.us-east-2.amazonaws.com/data?versionId")),
+    {S3::URI("https://bucketname.s3.us-east-2.amazonaws.com/data?versionId"),
     "https://s3.us-east-2.amazonaws.com",
     "bucketname",
     "data",
@ -83,7 +83,7 @@ class S3UriTest : public testing::TestWithParam<std::string>
 TEST(S3UriTest, validPatterns)
 {
    {
-        S3::URI uri(Poco::URI("https://jokserfn.s3.amazonaws.com/"));
+        S3::URI uri("https://jokserfn.s3.amazonaws.com/");
        ASSERT_EQ("https://s3.amazonaws.com", uri.endpoint);
        ASSERT_EQ("jokserfn", uri.bucket);
        ASSERT_EQ("", uri.key);
@ -91,7 +91,7 @@ TEST(S3UriTest, validPatterns)
        ASSERT_EQ(true, uri.is_virtual_hosted_style);
    }
    {
-        S3::URI uri(Poco::URI("https://s3.amazonaws.com/jokserfn/"));
+        S3::URI uri("https://s3.amazonaws.com/jokserfn/");
        ASSERT_EQ("https://s3.amazonaws.com", uri.endpoint);
        ASSERT_EQ("jokserfn", uri.bucket);
        ASSERT_EQ("", uri.key);
@ -99,7 +99,7 @@ TEST(S3UriTest, validPatterns)
        ASSERT_EQ(false, uri.is_virtual_hosted_style);
    }
    {
-        S3::URI uri(Poco::URI("https://amazonaws.com/bucket/"));
+        S3::URI uri("https://amazonaws.com/bucket/");
        ASSERT_EQ("https://amazonaws.com", uri.endpoint);
        ASSERT_EQ("bucket", uri.bucket);
        ASSERT_EQ("", uri.key);
@ -107,7 +107,7 @@ TEST(S3UriTest, validPatterns)
        ASSERT_EQ(false, uri.is_virtual_hosted_style);
    }
    {
-        S3::URI uri(Poco::URI("https://jokserfn.s3.amazonaws.com/data"));
+        S3::URI uri("https://jokserfn.s3.amazonaws.com/data");
        ASSERT_EQ("https://s3.amazonaws.com", uri.endpoint);
        ASSERT_EQ("jokserfn", uri.bucket);
        ASSERT_EQ("data", uri.key);
@ -115,7 +115,7 @@ TEST(S3UriTest, validPatterns)
        ASSERT_EQ(true, uri.is_virtual_hosted_style);
    }
    {
-        S3::URI uri(Poco::URI("https://storage.amazonaws.com/jokserfn/data"));
+        S3::URI uri("https://storage.amazonaws.com/jokserfn/data");
        ASSERT_EQ("https://storage.amazonaws.com", uri.endpoint);
        ASSERT_EQ("jokserfn", uri.bucket);
        ASSERT_EQ("data", uri.key);
@ -123,7 +123,7 @@ TEST(S3UriTest, validPatterns)
        ASSERT_EQ(false, uri.is_virtual_hosted_style);
    }
    {
-        S3::URI uri(Poco::URI("https://bucketname.cos.ap-beijing.myqcloud.com/data"));
+        S3::URI uri("https://bucketname.cos.ap-beijing.myqcloud.com/data");
        ASSERT_EQ("https://cos.ap-beijing.myqcloud.com", uri.endpoint);
        ASSERT_EQ("bucketname", uri.bucket);
        ASSERT_EQ("data", uri.key);
@ -131,7 +131,7 @@ TEST(S3UriTest, validPatterns)
        ASSERT_EQ(true, uri.is_virtual_hosted_style);
    }
    {
-        S3::URI uri(Poco::URI("https://bucketname.s3.us-east-2.amazonaws.com/data"));
+        S3::URI uri("https://bucketname.s3.us-east-2.amazonaws.com/data");
        ASSERT_EQ("https://s3.us-east-2.amazonaws.com", uri.endpoint);
        ASSERT_EQ("bucketname", uri.bucket);
        ASSERT_EQ("data", uri.key);
@ -139,7 +139,7 @@ TEST(S3UriTest, validPatterns)
        ASSERT_EQ(true, uri.is_virtual_hosted_style);
    }
    {
-        S3::URI uri(Poco::URI("https://s3.us-east-2.amazonaws.com/bucketname/data"));
+        S3::URI uri("https://s3.us-east-2.amazonaws.com/bucketname/data");
        ASSERT_EQ("https://s3.us-east-2.amazonaws.com", uri.endpoint);
        ASSERT_EQ("bucketname", uri.bucket);
        ASSERT_EQ("data", uri.key);
@ -147,7 +147,7 @@ TEST(S3UriTest, validPatterns)
        ASSERT_EQ(false, uri.is_virtual_hosted_style);
    }
    {
-        S3::URI uri(Poco::URI("https://bucketname.s3-us-east-2.amazonaws.com/data"));
+        S3::URI uri("https://bucketname.s3-us-east-2.amazonaws.com/data");
        ASSERT_EQ("https://s3-us-east-2.amazonaws.com", uri.endpoint);
        ASSERT_EQ("bucketname", uri.bucket);
        ASSERT_EQ("data", uri.key);
@ -155,7 +155,7 @@ TEST(S3UriTest, validPatterns)
        ASSERT_EQ(true, uri.is_virtual_hosted_style);
    }
    {
-        S3::URI uri(Poco::URI("https://s3-us-east-2.amazonaws.com/bucketname/data"));
+        S3::URI uri("https://s3-us-east-2.amazonaws.com/bucketname/data");
        ASSERT_EQ("https://s3-us-east-2.amazonaws.com", uri.endpoint);
        ASSERT_EQ("bucketname", uri.bucket);
        ASSERT_EQ("data", uri.key);
@ -166,7 +166,7 @@ TEST(S3UriTest, validPatterns)

 TEST_P(S3UriTest, invalidPatterns)
 {
-    ASSERT_ANY_THROW(S3::URI(Poco::URI(GetParam())));
+    ASSERT_ANY_THROW(S3::URI new_uri(GetParam()));
 }

 TEST(S3UriTest, versionIdChecks)
--- a/src/Interpreters/Context.cpp
+++ b/src/Interpreters/Context.cpp
@ -24,6 +24,7 @@
 #include <Storages/IStorage.h>
 #include <Storages/MarkCache.h>
 #include <Storages/MergeTree/MergeList.h>
+#include <Storages/MergeTree/MovesList.h>
 #include <Storages/MergeTree/ReplicatedFetchList.h>
 #include <Storages/MergeTree/MergeTreeData.h>
 #include <Storages/MergeTree/MergeTreeSettings.h>
@ -229,6 +230,7 @@ struct ContextSharedPart : boost::noncopyable
    ProcessList process_list;                               /// Executing queries at the moment.
    GlobalOvercommitTracker global_overcommit_tracker;
    MergeList merge_list;                                   /// The list of executable merge (for (Replicated)?MergeTree)
+    MovesList moves_list;                                   /// The list of executing moves (for (Replicated)?MergeTree)
    ReplicatedFetchList replicated_fetch_list;
    ConfigurationPtr users_config;                          /// Config with the users, profiles and quotas sections.
    InterserverIOHandler interserver_io_handler;            /// Handler for interserver communication.
@ -637,6 +639,8 @@ const ProcessList & Context::getProcessList() const { return shared->process_lis
 OvercommitTracker * Context::getGlobalOvercommitTracker() const { return &shared->global_overcommit_tracker; }
 MergeList & Context::getMergeList() { return shared->merge_list; }
 const MergeList & Context::getMergeList() const { return shared->merge_list; }
+MovesList & Context::getMovesList() { return shared->moves_list; }
+const MovesList & Context::getMovesList() const { return shared->moves_list; }
 ReplicatedFetchList & Context::getReplicatedFetchList() { return shared->replicated_fetch_list; }
 const ReplicatedFetchList & Context::getReplicatedFetchList() const { return shared->replicated_fetch_list; }

--- a/src/Interpreters/Context.h
+++ b/src/Interpreters/Context.h
@ -63,6 +63,7 @@ using InterserverCredentialsPtr = std::shared_ptr<const InterserverCredentials>;
 class InterserverIOHandler;
 class BackgroundSchedulePool;
 class MergeList;
+class MovesList;
 class ReplicatedFetchList;
 class Cluster;
 class Compiler;
@ -775,6 +776,9 @@ public:
    MergeList & getMergeList();
    const MergeList & getMergeList() const;

+    MovesList & getMovesList();
+    const MovesList & getMovesList() const;
+
    ReplicatedFetchList & getReplicatedFetchList();
    const ReplicatedFetchList & getReplicatedFetchList() const;

--- a/src/Interpreters/InterpreterExplainQuery.cpp
+++ b/src/Interpreters/InterpreterExplainQuery.cpp
@ -165,7 +165,7 @@ struct QueryASTSettings

 struct QueryTreeSettings
 {
-    bool run_passes = false;
+    bool run_passes = true;
    bool dump_passes = false;
    bool dump_ast = false;
    Int64 passes = -1;
--- a/src/Interpreters/PartLog.cpp
+++ b/src/Interpreters/PartLog.cpp
@ -6,6 +6,7 @@
 #include <DataTypes/DataTypeDate.h>
 #include <DataTypes/DataTypeString.h>
 #include <DataTypes/DataTypeEnum.h>
+#include <DataTypes/DataTypeUUID.h>
 #include <Storages/MergeTree/IMergeTreeDataPart.h>
 #include <Storages/MergeTree/MergeTreeData.h>
 #include <Interpreters/PartLog.h>
@ -100,6 +101,7 @@ NamesAndTypesList PartLogElement::getNamesAndTypes()

        {"database", std::make_shared<DataTypeString>()},
        {"table", std::make_shared<DataTypeString>()},
+        {"table_uuid", std::make_shared<DataTypeUUID>()},
        {"part_name", std::make_shared<DataTypeString>()},
        {"partition_id", std::make_shared<DataTypeString>()},
        {"part_type", std::make_shared<DataTypeString>()},
@ -137,6 +139,7 @@ void PartLogElement::appendToBlock(MutableColumns & columns) const

    columns[i++]->insert(database_name);
    columns[i++]->insert(table_name);
+    columns[i++]->insert(table_uuid);
    columns[i++]->insert(part_name);
    columns[i++]->insert(partition_id);
    columns[i++]->insert(part_type.toString());
@ -205,6 +208,7 @@ bool PartLog::addNewParts(

            elem.database_name = table_id.database_name;
            elem.table_name = table_id.table_name;
+            elem.table_uuid = table_id.uuid;
            elem.partition_id = part->info.partition_id;
            elem.part_name = part->name;
            elem.disk_name = part->getDataPartStorage().getDiskName();
--- a/src/Interpreters/PartLog.h
+++ b/src/Interpreters/PartLog.h
@ -4,6 +4,7 @@
 #include <Interpreters/SystemLog.h>
 #include <Core/NamesAndTypes.h>
 #include <Core/NamesAndAliases.h>
+#include <Core/UUID.h>
 #include <Storages/MergeTree/MergeType.h>
 #include <Storages/MergeTree/MergeAlgorithm.h>

@ -55,6 +56,7 @@ struct PartLogElement

    String database_name;
    String table_name;
+    UUID table_uuid{UUIDHelpers::Nil};
    String part_name;
    String partition_id;
    String disk_name;
--- a/src/Interpreters/ThreadStatusExt.cpp
+++ b/src/Interpreters/ThreadStatusExt.cpp
@ -131,6 +131,12 @@ void ThreadStatus::setupState(const ThreadGroupStatusPtr & thread_group_)
    thread_state = ThreadState::AttachedToQuery;
 }

+/// Marks this thread as internal. Once the flag is set, initPerformanceCounters() skips the
+/// perf-events and taskstats setup, and finalizePerformanceCounters() / initQueryProfiler() return early.
+void ThreadStatus::setInternalThread()
+{
+    /// Must be called before any query profiler has been created for this thread.
+    chassert(!query_profiler_real && !query_profiler_cpu);
+    internal_thread = true;
+}
+
 void ThreadStatus::initializeQuery()
 {
    setupState(std::make_shared<ThreadGroupStatus>());
@ -177,41 +183,44 @@ void ThreadStatus::initPerformanceCounters()
    // query_start_time_nanoseconds cannot be used here since RUsageCounters expect CLOCK_MONOTONIC
    *last_rusage = RUsageCounters::current();

-    if (auto query_context_ptr = query_context.lock())
+    if (!internal_thread)
    {
-        const Settings & settings = query_context_ptr->getSettingsRef();
-        if (settings.metrics_perf_events_enabled)
+        if (auto query_context_ptr = query_context.lock())
+        {
+            const Settings & settings = query_context_ptr->getSettingsRef();
+            if (settings.metrics_perf_events_enabled)
+            {
+                try
+                {
+                    current_thread_counters.initializeProfileEvents(
+                        settings.metrics_perf_events_list);
+                }
+                catch (...)
+                {
+                    tryLogCurrentException(__PRETTY_FUNCTION__);
+                }
+            }
+        }
+
+        if (!taskstats)
        {
            try
            {
-                current_thread_counters.initializeProfileEvents(
-                    settings.metrics_perf_events_list);
+                taskstats = TasksStatsCounters::create(thread_id);
            }
            catch (...)
            {
-                tryLogCurrentException(__PRETTY_FUNCTION__);
+                tryLogCurrentException(log);
            }
        }
+        if (taskstats)
+            taskstats->reset();
    }
-
-    if (!taskstats)
-    {
-        try
-        {
-            taskstats = TasksStatsCounters::create(thread_id);
-        }
-        catch (...)
-        {
-            tryLogCurrentException(log);
-        }
-    }
-    if (taskstats)
-        taskstats->reset();
 }

 void ThreadStatus::finalizePerformanceCounters()
 {
-    if (performance_counters_finalized)
+    if (performance_counters_finalized || internal_thread)
        return;

    performance_counters_finalized = true;
@ -270,7 +279,7 @@ void ThreadStatus::resetPerformanceCountersLastUsage()

 void ThreadStatus::initQueryProfiler()
 {
-    if (!query_profiler_enabled)
+    if (internal_thread)
        return;

    /// query profilers are useless without trace collector
--- a/src/Interpreters/TreeRewriter.cpp
+++ b/src/Interpreters/TreeRewriter.cpp
@ -1,8 +1,8 @@
 #include <algorithm>
 #include <memory>
+
 #include <Core/Settings.h>
 #include <Core/NamesAndTypes.h>
-
 #include <Core/SettingsEnums.h>

 #include <Interpreters/ArrayJoinedColumnsVisitor.h>
@ -45,10 +45,10 @@
 #include <DataTypes/NestedUtils.h>
 #include <DataTypes/DataTypeNullable.h>
 #include <DataTypes/DataTypeLowCardinality.h>
-#include <DataTypes/DataTypesNumber.h>

 #include <IO/WriteHelpers.h>
 #include <Storages/IStorage.h>
+#include <Common/checkStackSize.h>

 #include <AggregateFunctions/AggregateFunctionFactory.h>

@ -784,6 +784,67 @@ void collectJoinedColumns(TableJoin & analyzed_join, ASTTableJoin & table_join,
    }
 }

+/// Appends to `into` the largest sub-expressions of `expr` that do not contain an aggregate function.
+/// Returns a pair: {does `expr` contain an aggregate function, number of entries this call left appended to `into`}.
+std::pair<bool, UInt64> recursivelyCollectMaxOrdinaryExpressions(const ASTPtr & expr, ASTExpressionList & into)
+{
+    checkStackSize();
+
+    /// A plain identifier is always a key by itself.
+    if (expr->as<ASTIdentifier>())
+    {
+        into.children.push_back(expr);
+        return {false, 1};
+    }
+
+    auto * function = expr->as<ASTFunction>();
+
+    /// Anything that is neither an identifier nor a function contributes nothing.
+    if (!function)
+        return {false, 0};
+
+    /// Aggregate functions are never keys, and their arguments are not inspected.
+    if (AggregateUtils::isAggregateFunction(*function))
+        return {true, 0};
+
+    bool contains_aggregate = false;
+    UInt64 appended = 0;
+
+    for (const auto & argument : function->arguments->children)
+    {
+        const auto argument_result = recursivelyCollectMaxOrdinaryExpressions(argument, into);
+        contains_aggregate |= argument_result.first;
+        appended += argument_result.second;
+    }
+
+    /// Some argument contains an aggregate: keep whatever the arguments contributed.
+    if (contains_aggregate)
+        return {true, appended};
+
+    /// No aggregate anywhere below: the whole function call is the maximal ordinary expression,
+    /// so drop the entries appended for its arguments and append the call itself instead.
+    while (appended > 0)
+    {
+        into.children.pop_back();
+        --appended;
+    }
+
+    into.children.push_back(expr);
+    return {false, 1};
+}
+
+/** Replace GROUP BY ALL with an explicit key list built from the SELECT list:
+  * every SELECT-ed expression that is not an aggregate function becomes a key.
+  *
+  * If a function mixes aggregate functions with other fields among its arguments,
+  * the keys contain the largest non-aggregate sub-expressions that can be extracted from it.
+  *
+  * Example:
+  * SELECT substring(a, 4, 2), substring(substring(a, 1, 2), 1, count(b)) FROM t GROUP BY ALL
+  * is expanded to
+  * SELECT substring(a, 4, 2), substring(substring(a, 1, 2), 1, count(b)) FROM t GROUP BY substring(a, 4, 2), substring(a, 1, 2)
+  */
+void expandGroupByAll(ASTSelectQuery * select_query)
+{
+    auto group_by_keys = std::make_shared<ASTExpressionList>();
+
+    const ASTPtr select_list = select_query->select();
+    for (const auto & selected_expression : select_list->children)
+        recursivelyCollectMaxOrdinaryExpressions(selected_expression, *group_by_keys);
+
+    select_query->setExpression(ASTSelectQuery::Expression::GROUP_BY, group_by_keys);
+}

 std::vector<const ASTFunction *> getAggregates(ASTPtr & query, const ASTSelectQuery & select_query)
 {
@ -1276,6 +1337,10 @@ TreeRewriterResultPtr TreeRewriter::analyzeSelect(

    normalize(query, result.aliases, all_source_columns_set, select_options.ignore_alias, settings, /* allow_self_aliases = */ true, getContext());

+    // expand GROUP BY ALL
+    if (select_query->group_by_all)
+        expandGroupByAll(select_query);
+
    /// Remove unneeded columns according to 'required_result_columns'.
    /// Leave all selected columns in case of DISTINCT; columns that contain arrayJoin function inside.
    /// Must be after 'normalizeTree' (after expanding aliases, for aliases not get lost)
--- a/src/Parsers/ASTSelectQuery.cpp
+++ b/src/Parsers/ASTSelectQuery.cpp
@ -93,7 +93,7 @@ void ASTSelectQuery::formatImpl(const FormatSettings & s, FormatState & state, F
        where()->formatImpl(s, state, frame);
    }

-    if (groupBy())
+    if (!group_by_all && groupBy())
    {
        s.ostr << (s.hilite ? hilite_keyword : "") << s.nl_or_ws << indent_str << "GROUP BY" << (s.hilite ? hilite_none : "");
        if (!group_by_with_grouping_sets)
@ -104,6 +104,9 @@ void ASTSelectQuery::formatImpl(const FormatSettings & s, FormatState & state, F
        }
    }

+    if (group_by_all)
+        s.ostr << (s.hilite ? hilite_keyword : "") << s.nl_or_ws << indent_str << "GROUP BY ALL" << (s.hilite ? hilite_none : "");
+
    if (group_by_with_rollup)
        s.ostr << (s.hilite ? hilite_keyword : "") << s.nl_or_ws << indent_str << (s.one_line ? "" : "    ") << "WITH ROLLUP" << (s.hilite ? hilite_none : "");

--- a/src/Parsers/ASTSelectQuery.h
+++ b/src/Parsers/ASTSelectQuery.h
@ -82,6 +82,7 @@ public:
    ASTPtr clone() const override;

    bool distinct = false;
+    bool group_by_all = false;
    bool group_by_with_totals = false;
    bool group_by_with_rollup = false;
    bool group_by_with_cube = false;
--- a/src/Parsers/ParserSelectQuery.cpp
+++ b/src/Parsers/ParserSelectQuery.cpp
@ -195,6 +195,8 @@ bool ParserSelectQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
            select_query->group_by_with_cube = true;
        else if (s_grouping_sets.ignore(pos, expected))
            select_query->group_by_with_grouping_sets = true;
+        else if (s_all.ignore(pos, expected))
+            select_query->group_by_all = true;

        if ((select_query->group_by_with_rollup || select_query->group_by_with_cube || select_query->group_by_with_grouping_sets) &&
            !open_bracket.ignore(pos, expected))
@ -205,7 +207,7 @@ bool ParserSelectQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
            if (!grouping_sets_list.parse(pos, group_expression_list, expected))
                return false;
        }
-        else
+        else if (!select_query->group_by_all)
        {
            if (!exp_list.parse(pos, group_expression_list, expected))
                return false;
--- a/src/Planner/PlannerJoins.cpp
+++ b/src/Planner/PlannerJoins.cpp
@ -87,8 +87,8 @@ void JoinClause::dump(WriteBuffer & buffer) const
        {
            const auto & asof_condition = asof_conditions[i];

-            buffer << "key_index: " << asof_condition.key_index;
-            buffer << "inequality: " << toString(asof_condition.asof_inequality);
+            buffer << " key_index: " << asof_condition.key_index;
+            buffer << " inequality: " << toString(asof_condition.asof_inequality);

            if (i + 1 != asof_conditions_size)
                buffer << ',';
--- a/src/Planner/TableExpressionData.h
+++ b/src/Planner/TableExpressionData.h
@ -183,19 +183,19 @@ public:
    }

 private:
-    /// Valid for table, table function, query, union, array join table expression nodes
+    /// Valid for table, table function, array join, query, union nodes
    NamesAndTypesList columns;

-    /// Valid for table, table function, query, union, array join table expression nodes
+    /// Valid for table, table function, array join, query, union nodes
    NameSet columns_names;

-    /// Valid only for table table expression node
+    /// Valid only for table node
    NameSet alias_columns_names;

-    /// Valid for table, table function, query, union table, array join expression nodes
+    /// Valid for table, table function, array join, query, union nodes
    ColumnNameToColumnIdentifier column_name_to_column_identifier;

-    /// Valid for table, table function, query, union table, array join expression nodes
+    /// Valid for table, table function, array join, query, union nodes
    ColumnIdentifierToColumnName column_identifier_to_column_name;

    /// Is storage remote
--- a/src/Processors/Formats/Impl/BSONEachRowRowInputFormat.cpp
+++ b/src/Processors/Formats/Impl/BSONEachRowRowInputFormat.cpp
@ -0,0 +1,978 @@
+#include <IO/ReadBufferFromString.h>
+
+#include <Formats/FormatFactory.h>
+#include <Formats/FormatSettings.h>
+#include <Formats/BSONTypes.h>
+#include <Formats/EscapingRuleUtils.h>
+#include <Processors/Formats/Impl/BSONEachRowRowInputFormat.h>
+#include <IO/ReadHelpers.h>
+
+#include <Columns/ColumnsNumber.h>
+#include <Columns/ColumnNullable.h>
+#include <Columns/ColumnLowCardinality.h>
+#include <Columns/ColumnString.h>
+#include <Columns/ColumnFixedString.h>
+#include <Columns/ColumnDecimal.h>
+#include <Columns/ColumnArray.h>
+#include <Columns/ColumnTuple.h>
+#include <Columns/ColumnMap.h>
+
+#include <DataTypes/DataTypeString.h>
+#include <DataTypes/DataTypeUUID.h>
+#include <DataTypes/DataTypeDateTime64.h>
+#include <DataTypes/DataTypeLowCardinality.h>
+#include <DataTypes/DataTypeNullable.h>
+#include <DataTypes/DataTypeArray.h>
+#include <DataTypes/DataTypeTuple.h>
+#include <DataTypes/DataTypeMap.h>
+#include <DataTypes/DataTypeFactory.h>
+#include <DataTypes/getLeastSupertype.h>
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int INCORRECT_DATA;
+    extern const int ILLEGAL_COLUMN;
+    extern const int TOO_LARGE_STRING_SIZE;
+    extern const int UNKNOWN_TYPE;
+}
+
+namespace
+{
+    /// Sentinel index returned by columnIndex() when a key name is not found in the header's name map.
+    enum
+    {
+        UNKNOWN_FIELD = size_t(-1),
+    };
+}
+
+/// Constructs the input format for the given header.
+/// Besides forwarding to IRowInputFormat, it keeps the format settings, the header's
+/// name -> index map (name_map), the column data types (types), and a key-position lookup cache
+/// (prev_positions) that is constructed from the number of header columns.
+BSONEachRowRowInputFormat::BSONEachRowRowInputFormat(
+    ReadBuffer & in_, const Block & header_, Params params_, const FormatSettings & format_settings_)
+    : IRowInputFormat(header_, in_, std::move(params_))
+    , format_settings(format_settings_)
+    , name_map(header_.getNamesToIndexesMap())
+    , prev_positions(header_.columns())
+    , types(header_.getDataTypes())
+{
+}
+
+/// Resolve a key name to its column index in the header, or UNKNOWN_FIELD if there is no such column.
+/// `key_index` is the position of the key inside the current document and selects the cache slot to use.
+inline size_t BSONEachRowRowInputFormat::columnIndex(const StringRef & name, size_t key_index)
+{
+    /// The order of fields is almost always the same from row to row, so first try the entry
+    /// remembered for this key position before searching the hash table.
+    const bool has_cache_slot = key_index < prev_positions.size();
+
+    if (has_cache_slot && prev_positions[key_index] && name == prev_positions[key_index]->getKey())
+        return prev_positions[key_index]->getMapped();
+
+    auto * found = name_map.find(name);
+    if (!found)
+        return UNKNOWN_FIELD;
+
+    /// Remember the match for this key position (only when the position has a cache slot).
+    if (has_cache_slot)
+        prev_positions[key_index] = found;
+
+    return found->getMapped();
+}
+
+/// Read a zero-terminated field name. The resulting StringRef is valid only until the next read from the buffer.
+static StringRef readBSONKeyName(ReadBuffer & in, String & key_holder)
+{
+    /// Optimization: when the terminating zero byte is already inside the current buffer,
+    /// reference the name in place and avoid copying it into key_holder.
+    if (!in.eof())
+    {
+        char * const name_begin = in.position();
+        char * const buffer_end = in.buffer().end();
+        char * const terminator = find_first_symbols<0>(name_begin, buffer_end);
+
+        if (terminator != buffer_end)
+        {
+            /// Skip the name together with its terminating zero byte.
+            in.position() = terminator + 1;
+            return StringRef(name_begin, terminator - name_begin);
+        }
+    }
+
+    /// Otherwise fall back to copying the name into key_holder.
+    key_holder.clear();
+    readNullTerminated(key_holder, in);
+    return key_holder;
+}
+
+/// Read the one-byte type tag and return it as a raw UInt8 (callers convert it to an enum).
+static UInt8 readBSONType(ReadBuffer & in)
+{
+    UInt8 raw_type;
+    readBinary(raw_type, in);
+    return raw_type;
+}
+
+/// Read a BSON size field (stored as BSONSizeT) and return it widened to size_t.
+static size_t readBSONSize(ReadBuffer & in)
+{
+    BSONSizeT raw_size;
+    readBinary(raw_size, in);
+    return raw_size;
+}
+
+/// Read a BSON Int32, Int64 or Bool value and append it to an integer column.
+/// T is the element type of the destination ColumnVector; the value is converted with static_cast<T>.
+/// Throws ILLEGAL_COLUMN for any other BSON type.
+template <typename T>
+static void readAndInsertInteger(ReadBuffer & in, IColumn & column, const DataTypePtr & data_type, BSONType bson_type)
+{
+    /// We allow to read any integer into any integer column.
+    /// For example we can read BSON Int32 into ClickHouse UInt8.
+
+    if (bson_type == BSONType::INT32)
+    {
+        /// BSON Int32 is a signed type. Read it as signed so that a negative value is sign-extended
+        /// (not zero-extended) when T is wider than 32 bits, e.g. -1 read into an Int64 column stays -1.
+        /// For T of 32 bits or less the resulting bit pattern is the same as with an unsigned read.
+        Int32 value;
+        readBinary(value, in);
+        assert_cast<ColumnVector<T> &>(column).insertValue(static_cast<T>(value));
+    }
+    else if (bson_type == BSONType::INT64)
+    {
+        /// BSON Int64 is signed as well; read it as signed for the same reason.
+        Int64 value;
+        readBinary(value, in);
+        assert_cast<ColumnVector<T> &>(column).insertValue(static_cast<T>(value));
+    }
+    else if (bson_type == BSONType::BOOL)
+    {
+        UInt8 value;
+        readBinary(value, in);
+        assert_cast<ColumnVector<T> &>(column).insertValue(static_cast<T>(value));
+    }
+    else
+    {
+        throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot insert BSON {} into column with type {}", getBSONTypeName(bson_type), data_type->getName());
+    }
+}
+
+/// Read a BSON Double and append it to a floating-point column whose element type is T.
+/// Throws ILLEGAL_COLUMN for any other BSON type.
+template <typename T>
+static void readAndInsertDouble(ReadBuffer & in, IColumn & column, const DataTypePtr & data_type, BSONType bson_type)
+{
+    const bool is_double = bson_type == BSONType::DOUBLE;
+    if (!is_double)
+        throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot insert BSON {} into column with type {}", getBSONTypeName(bson_type), data_type->getName());
+
+    /// The value is always stored as Float64 and converted to the column's element type afterwards.
+    Float64 parsed;
+    readBinary(parsed, in);
+
+    auto & float_column = assert_cast<ColumnVector<T> &>(column);
+    float_column.insertValue(static_cast<T>(parsed));
+}
+
+/// Read a decimal stored directly as the binary representation of DecimalType and append it to a Decimal column.
+/// The BSON type must be exactly `expected_bson_type`; otherwise ILLEGAL_COLUMN is thrown.
+template <typename DecimalType, BSONType expected_bson_type>
+static void readAndInsertSmallDecimal(ReadBuffer & in, IColumn & column, const DataTypePtr & data_type, BSONType bson_type)
+{
+    const bool type_matches = bson_type == expected_bson_type;
+    if (!type_matches)
+        throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot insert BSON {} into column with type {}", getBSONTypeName(bson_type), data_type->getName());
+
+    DecimalType parsed;
+    readBinary(parsed, in);
+
+    auto & decimal_column = assert_cast<ColumnDecimal<DecimalType> &>(column);
+    decimal_column.insertValue(parsed);
+}
+
+/// Read a BSON Int64 or Datetime value and append it to a DateTime64 column as is.
+/// Throws ILLEGAL_COLUMN for any other BSON type.
+static void readAndInsertDateTime64(ReadBuffer & in, IColumn & column, BSONType bson_type)
+{
+    const bool is_supported = bson_type == BSONType::INT64 || bson_type == BSONType::DATETIME;
+    if (!is_supported)
+        throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot insert BSON {} into DateTime64 column", getBSONTypeName(bson_type));
+
+    DateTime64 parsed;
+    readBinary(parsed, in);
+
+    auto & datetime_column = assert_cast<DataTypeDateTime64::ColumnType &>(column);
+    datetime_column.insertValue(parsed);
+}
+
+/// Read a value stored as BSON Binary (subtype BINARY) whose payload is the raw binary
+/// representation of ColumnType::ValueType, and append it to the column.
+/// Throws ILLEGAL_COLUMN on a wrong BSON type or binary subtype and INCORRECT_DATA on a payload size mismatch.
+template <typename ColumnType>
+static void readAndInsertBigInteger(ReadBuffer & in, IColumn & column, const DataTypePtr & data_type, BSONType bson_type)
+{
+    using ValueType = typename ColumnType::ValueType;
+
+    if (bson_type != BSONType::BINARY)
+        throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot insert BSON {} into column with type {}", getBSONTypeName(bson_type), data_type->getName());
+
+    /// Binary layout: payload size, then one subtype byte, then the payload itself.
+    const auto payload_size = readBSONSize(in);
+    const auto binary_subtype = getBSONBinarySubtype(readBSONType(in));
+
+    if (binary_subtype != BSONBinarySubtype::BINARY)
+        throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot insert BSON Binary subtype {} into column with type {}", getBSONBinarySubtypeName(binary_subtype), data_type->getName());
+
+    /// The payload must be exactly as large as the value it represents.
+    if (payload_size != sizeof(ValueType))
+        throw Exception(
+            ErrorCodes::INCORRECT_DATA,
+            "Cannot parse value of type {}, size of binary data is not equal to the binary size of expected value: {} != {}",
+            data_type->getName(),
+            payload_size,
+            sizeof(ValueType));
+
+    ValueType parsed;
+    readBinary(parsed, in);
+    assert_cast<ColumnType &>(column).insertValue(parsed);
+}
+
+/// Read exactly `size` bytes from `in` and append them as one value to a FixedString column
+/// (is_fixed_string == true) or to a String column (is_fixed_string == false).
+/// If reading fails, the column is restored to the size it had before the call and the exception is rethrown.
+/// For FixedString, throws TOO_LARGE_STRING_SIZE when `size` exceeds the column's N.
+template <bool is_fixed_string>
+static void readAndInsertStringImpl(ReadBuffer & in, IColumn & column, size_t size)
+{
+    if constexpr (is_fixed_string)
+    {
+        auto & fixed_string_column = assert_cast<ColumnFixedString &>(column);
+        size_t n = fixed_string_column.getN();
+        if (size > n)
+            throw Exception("Too large string for FixedString column", ErrorCodes::TOO_LARGE_STRING_SIZE);
+
+        auto & data = fixed_string_column.getChars();
+
+        /// Every value occupies exactly n bytes; only the first `size` of them are overwritten below.
+        /// The tail keeps whatever resize_fill put there (presumably zeros - confirm).
+        size_t old_size = data.size();
+        data.resize_fill(old_size + n);
+
+        try
+        {
+            in.readStrict(reinterpret_cast<char *>(data.data() + old_size), size);
+        }
+        catch (...)
+        {
+            /// Restore column state in case of any exception.
+            data.resize_assume_reserved(old_size);
+            throw;
+        }
+    }
+    else
+    {
+        auto & column_string = assert_cast<ColumnString &>(column);
+        auto & data = column_string.getChars();
+        auto & offsets = column_string.getOffsets();
+
+        /// The new value takes `size` bytes plus one terminating zero byte; `offset` is the new end of chars.
+        size_t old_chars_size = data.size();
+        size_t offset = old_chars_size + size + 1;
+        offsets.push_back(offset);
+
+        try
+        {
+            data.resize(offset);
+            /// offset - size - 1 == old_chars_size, i.e. the first byte of the new value.
+            in.readStrict(reinterpret_cast<char *>(&data[offset - size - 1]), size);
+            data.back() = 0;
+        }
+        catch (...)
+        {
+            /// Restore column state in case of any exception.
+            offsets.pop_back();
+            data.resize_assume_reserved(old_chars_size);
+            throw;
+        }
+    }
+}
+
+/// Read a BSON string-like value and append it to a String or FixedString column.
+/// Accepted BSON types:
+///  - String, Symbol, JavaScript code: size-prefixed bytes followed by a zero byte (the zero byte is not stored);
+///  - Binary with subtype BINARY or BINARY_OLD: the raw payload;
+///  - ObjectId: 12 raw bytes.
+/// Throws INCORRECT_DATA on a zero string size, ILLEGAL_COLUMN on an unsupported BSON type or binary subtype.
+template <bool is_fixed_string>
+static void readAndInsertString(ReadBuffer & in, IColumn & column, BSONType bson_type)
+{
+    if (bson_type == BSONType::STRING || bson_type == BSONType::SYMBOL || bson_type == BSONType::JAVA_SCRIPT_CODE)
+    {
+        auto size = readBSONSize(in);
+        /// The declared size includes the trailing zero byte, so a valid string never has size 0.
+        /// Without this check `size - 1` would wrap around to the maximum size_t on malformed input.
+        if (size == 0)
+            throw Exception(ErrorCodes::INCORRECT_DATA, "Incorrect size of a string (zero) in BSON");
+
+        readAndInsertStringImpl<is_fixed_string>(in, column, size - 1);
+        assertChar(0, in);
+    }
+    else if (bson_type == BSONType::BINARY)
+    {
+        /// Binary layout: payload size, then one subtype byte, then the payload itself.
+        auto size = readBSONSize(in);
+        auto subtype = getBSONBinarySubtype(readBSONType(in));
+        if (subtype == BSONBinarySubtype::BINARY || subtype == BSONBinarySubtype::BINARY_OLD)
+            readAndInsertStringImpl<is_fixed_string>(in, column, size);
+        else
+            throw Exception(
+                ErrorCodes::ILLEGAL_COLUMN,
+                "Cannot insert BSON Binary subtype {} into String column",
+                getBSONBinarySubtypeName(subtype));
+    }
+    else if (bson_type == BSONType::OBJECT_ID)
+    {
+        /// ObjectId has a fixed size of 12 bytes and no size prefix.
+        readAndInsertStringImpl<is_fixed_string>(in, column, 12);
+    }
+    else
+    {
+        throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot insert BSON {} into String column", getBSONTypeName(bson_type));
+    }
+}
+
+/// Read a BSON Binary value with subtype UUID or UUID_OLD and append it to a UUID column.
+/// Throws ILLEGAL_COLUMN on a wrong BSON type or binary subtype and INCORRECT_DATA
+/// when the payload size differs from sizeof(UUID).
+static void readAndInsertUUID(ReadBuffer & in, IColumn & column, BSONType bson_type)
+{
+    if (bson_type != BSONType::BINARY)
+        throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot insert BSON {} into UUID column", getBSONTypeName(bson_type));
+
+    /// Binary layout: payload size, then one subtype byte, then the payload itself.
+    const auto payload_size = readBSONSize(in);
+    const auto binary_subtype = getBSONBinarySubtype(readBSONType(in));
+
+    const bool is_uuid_subtype = binary_subtype == BSONBinarySubtype::UUID || binary_subtype == BSONBinarySubtype::UUID_OLD;
+    if (!is_uuid_subtype)
+    {
+        throw Exception(
+            ErrorCodes::ILLEGAL_COLUMN,
+            "Cannot insert BSON Binary subtype {} into UUID column",
+            getBSONBinarySubtypeName(binary_subtype));
+    }
+
+    if (payload_size != sizeof(UUID))
+        throw Exception(
+            ErrorCodes::INCORRECT_DATA,
+            "Cannot parse value of type UUID, size of binary data is not equal to the binary size of UUID value: {} != {}",
+            payload_size,
+            sizeof(UUID));
+
+    UUID parsed;
+    readBinary(parsed, in);
+    assert_cast<ColumnUUID &>(column).insertValue(parsed);
+}
+
+/// Read a BSON array document and append it as one value of an Array column.
+/// The key name of every element is read but not used here; each element is parsed with the nested type.
+/// Throws ILLEGAL_COLUMN if the BSON value is not an array.
+void BSONEachRowRowInputFormat::readArray(IColumn & column, const DataTypePtr & data_type, BSONType bson_type)
+{
+    if (bson_type != BSONType::ARRAY)
+        throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot insert BSON {} into Array column", getBSONTypeName(bson_type));
+
+    const auto * array_type = assert_cast<const DataTypeArray *>(data_type.get());
+    const auto & element_type = array_type->getNestedType();
+    auto & array_column = assert_cast<ColumnArray &>(column);
+    auto & elements_column = array_column.getData();
+
+    /// The document size is counted from the first byte of the size field itself.
+    const size_t document_start = in->count();
+    BSONSizeT document_size;
+    readBinary(document_size, *in);
+
+    /// Keep reading elements until only the BSON_DOCUMENT_END byte of this document is left.
+    for (;;)
+    {
+        const bool only_terminator_left = in->count() - document_start + sizeof(BSON_DOCUMENT_END) == document_size;
+        if (only_terminator_left)
+            break;
+
+        auto element_bson_type = getBSONType(readBSONType(*in));
+        readBSONKeyName(*in, current_key_name);
+        readField(elements_column, element_type, element_bson_type);
+    }
+
+    assertChar(BSON_DOCUMENT_END, *in);
+    array_column.getOffsets().push_back(elements_column.size());
+}
+
+/// Read a BSON array or embedded document and append it as one value of a Tuple column.
+/// For an embedded document, fields are matched to tuple elements by name;
+/// for an array, they are matched by position.
+/// Throws ILLEGAL_COLUMN on any other BSON type and INCORRECT_DATA when a field name is unknown,
+/// when there are more fields than tuple elements, or when the number of fields read differs
+/// from the number of tuple elements.
+void BSONEachRowRowInputFormat::readTuple(IColumn & column, const DataTypePtr & data_type, BSONType bson_type)
+{
+    if (bson_type != BSONType::ARRAY && bson_type != BSONType::DOCUMENT)
+        throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot insert BSON {} into Tuple column", getBSONTypeName(bson_type));
+
+    /// When BSON type is ARRAY, names in nested document are not useful
+    /// (most likely they are just sequential numbers).
+    bool use_key_names = bson_type == BSONType::DOCUMENT;
+
+    const auto * data_type_tuple = assert_cast<const DataTypeTuple *>(data_type.get());
+    auto & tuple_column = assert_cast<ColumnTuple &>(column);
+    size_t read_nested_columns = 0;
+
+    /// The document size is counted from the first byte of the size field itself;
+    /// the loop stops when only the BSON_DOCUMENT_END byte is left.
+    size_t document_start = in->count();
+    BSONSizeT document_size;
+    readBinary(document_size, *in);
+    while (in->count() - document_start + sizeof(BSON_DOCUMENT_END) != document_size)
+    {
+        auto nested_bson_type = getBSONType(readBSONType(*in));
+        auto name = readBSONKeyName(*in, current_key_name);
+
+        /// Positional by default; replaced by the named element's position for embedded documents.
+        size_t index = read_nested_columns;
+        if (use_key_names)
+        {
+            auto try_get_index = data_type_tuple->tryGetPositionByName(name.toString());
+            if (!try_get_index)
+                throw Exception(
+                    ErrorCodes::INCORRECT_DATA,
+                    "Cannot parse tuple column with type {} from BSON array/embedded document field: tuple doesn't have element with name \"{}\"",
+                    data_type->getName(),
+                    name);
+            index = *try_get_index;
+        }
+
+        if (index >= data_type_tuple->getElements().size())
+            throw Exception(
+                ErrorCodes::INCORRECT_DATA,
+                "Cannot parse tuple column with type {} from BSON array/embedded document field: the number of fields BSON document exceeds the number of fields in tuple",
+                data_type->getName());
+
+        /// NOTE(review): only the number of fields is tracked, not which elements were filled,
+        /// so a document repeating a key would insert twice into the same element - confirm whether a guard is needed.
+        readField(tuple_column.getColumn(index), data_type_tuple->getElement(index), nested_bson_type);
+        ++read_nested_columns;
+    }
+
+    assertChar(BSON_DOCUMENT_END, *in);
+
+    if (read_nested_columns != data_type_tuple->getElements().size())
+        throw Exception(
+            ErrorCodes::INCORRECT_DATA,
+            "Cannot parse tuple column with type {} from BSON array/embedded document field, the number of fields in tuple and BSON document doesn't match: {} != {}",
+            data_type->getName(),
+            data_type_tuple->getElements().size(),
+            read_nested_columns);
+}
+
+/// Reads an embedded BSON document into a Map column: every key of the document becomes
+/// a map key and the corresponding field value becomes the map value.
+/// Throws ILLEGAL_COLUMN if the BSON value is not a document or the map key type is not String/FixedString.
+void BSONEachRowRowInputFormat::readMap(IColumn & column, const DataTypePtr & data_type, BSONType bson_type)
+{
+    if (bson_type != BSONType::DOCUMENT)
+        throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot insert BSON {} into Map column", getBSONTypeName(bson_type));
+
+    const auto * map_type = assert_cast<const DataTypeMap *>(data_type.get());
+
+    /// BSON keys are always strings, so only string-like map keys can be filled.
+    const auto & map_key_type = map_type->getKeyType();
+    if (!isStringOrFixedString(map_key_type))
+        throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Only maps with String key type are supported in BSON, got key type: {}", map_key_type->getName());
+
+    const auto & map_value_type = map_type->getValueType();
+
+    auto & map_column = assert_cast<ColumnMap &>(column);
+    auto & keys = map_column.getNestedData().getColumn(0);
+    auto & values = map_column.getNestedData().getColumn(1);
+    auto & map_offsets = map_column.getNestedColumn().getOffsets();
+
+    const size_t start_position = in->count();
+    BSONSizeT total_size;
+    readBinary(total_size, *in);
+
+    /// Read fields until only the terminating byte of the document is left.
+    for (;;)
+    {
+        if (in->count() - start_position + sizeof(BSON_DOCUMENT_END) == total_size)
+            break;
+
+        auto value_bson_type = getBSONType(readBSONType(*in));
+        auto key = readBSONKeyName(*in, current_key_name);
+        keys.insertData(key.data, key.size);
+        readField(values, map_value_type, value_bson_type);
+    }
+
+    assertChar(BSON_DOCUMENT_END, *in);
+    map_offsets.push_back(keys.size());
+}
+
+
+/// Reads a single BSON value of type bson_type from the input buffer and appends it to the column.
+///
+/// Returns true if a value was inserted into the column as read from the data and false
+/// if a BSON Null was replaced with the default value of a non-nullable column
+/// (only possible when the null_as_default setting is enabled).
+///
+/// Throws ILLEGAL_COLUMN if a BSON Null is met for a non-nullable column while null_as_default
+/// is disabled, or if the ClickHouse type is not supported by this format.
+bool BSONEachRowRowInputFormat::readField(IColumn & column, const DataTypePtr & data_type, BSONType bson_type)
+{
+    if (bson_type == BSONType::NULL_VALUE)
+    {
+        if (data_type->isNullable())
+        {
+            column.insertDefault();
+            return true;
+        }
+
+        /// The message has a single placeholder, so only the column type must be passed:
+        /// an extra leading argument would be printed instead of it.
+        if (!format_settings.null_as_default)
+            throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot insert BSON Null value into non-nullable column with type {}", data_type->getName());
+
+        column.insertDefault();
+        return false;
+    }
+
+    switch (data_type->getTypeId())
+    {
+        case TypeIndex::Nullable:
+        {
+            /// The value is not Null here (handled above), so mark it as present and read the nested value.
+            auto & nullable_column = assert_cast<ColumnNullable &>(column);
+            auto & nested_column = nullable_column.getNestedColumn();
+            const auto & nested_type = assert_cast<const DataTypeNullable *>(data_type.get())->getNestedType();
+            nullable_column.getNullMapColumn().insertValue(0);
+            return readField(nested_column, nested_type, bson_type);
+        }
+        case TypeIndex::LowCardinality:
+        {
+            /// Read the value into a temporary column of the dictionary type and then insert it into the LowCardinality column.
+            auto & lc_column = assert_cast<ColumnLowCardinality &>(column);
+            auto tmp_column = lc_column.getDictionary().getNestedColumn()->cloneEmpty();
+            const auto & dict_type = assert_cast<const DataTypeLowCardinality *>(data_type.get())->getDictionaryType();
+            auto res = readField(*tmp_column, dict_type, bson_type);
+            lc_column.insertFromFullColumn(*tmp_column, 0);
+            return res;
+        }
+        case TypeIndex::Int8:
+        {
+            readAndInsertInteger<Int8>(*in, column, data_type, bson_type);
+            return true;
+        }
+        case TypeIndex::UInt8:
+        {
+            readAndInsertInteger<UInt8>(*in, column, data_type, bson_type);
+            return true;
+        }
+        case TypeIndex::Int16:
+        {
+            readAndInsertInteger<Int16>(*in, column, data_type, bson_type);
+            return true;
+        }
+        case TypeIndex::Date: [[fallthrough]];
+        case TypeIndex::UInt16:
+        {
+            readAndInsertInteger<UInt16>(*in, column, data_type, bson_type);
+            return true;
+        }
+        case TypeIndex::Date32: [[fallthrough]];
+        case TypeIndex::Int32:
+        {
+            readAndInsertInteger<Int32>(*in, column, data_type, bson_type);
+            return true;
+        }
+        case TypeIndex::DateTime: [[fallthrough]];
+        case TypeIndex::UInt32:
+        {
+            readAndInsertInteger<UInt32>(*in, column, data_type, bson_type);
+            return true;
+        }
+        case TypeIndex::Int64:
+        {
+            readAndInsertInteger<Int64>(*in, column, data_type, bson_type);
+            return true;
+        }
+        case TypeIndex::UInt64:
+        {
+            readAndInsertInteger<UInt64>(*in, column, data_type, bson_type);
+            return true;
+        }
+        case TypeIndex::Int128:
+        {
+            readAndInsertBigInteger<ColumnInt128>(*in, column, data_type, bson_type);
+            return true;
+        }
+        case TypeIndex::UInt128:
+        {
+            readAndInsertBigInteger<ColumnUInt128>(*in, column, data_type, bson_type);
+            return true;
+        }
+        case TypeIndex::Int256:
+        {
+            readAndInsertBigInteger<ColumnInt256>(*in, column, data_type, bson_type);
+            return true;
+        }
+        case TypeIndex::UInt256:
+        {
+            readAndInsertBigInteger<ColumnUInt256>(*in, column, data_type, bson_type);
+            return true;
+        }
+        case TypeIndex::Float32:
+        {
+            readAndInsertDouble<Float32>(*in, column, data_type, bson_type);
+            return true;
+        }
+        case TypeIndex::Float64:
+        {
+            readAndInsertDouble<Float64>(*in, column, data_type, bson_type);
+            return true;
+        }
+        case TypeIndex::Decimal32:
+        {
+            readAndInsertSmallDecimal<Decimal32, BSONType::INT32>(*in, column, data_type, bson_type);
+            return true;
+        }
+        case TypeIndex::Decimal64:
+        {
+            readAndInsertSmallDecimal<Decimal64, BSONType::INT64>(*in, column, data_type, bson_type);
+            return true;
+        }
+        case TypeIndex::Decimal128:
+        {
+            readAndInsertBigInteger<ColumnDecimal<Decimal128>>(*in, column, data_type, bson_type);
+            return true;
+        }
+        case TypeIndex::Decimal256:
+        {
+            readAndInsertBigInteger<ColumnDecimal<Decimal256>>(*in, column, data_type, bson_type);
+            return true;
+        }
+        case TypeIndex::DateTime64:
+        {
+            readAndInsertDateTime64(*in, column, bson_type);
+            return true;
+        }
+        case TypeIndex::FixedString:
+        {
+            readAndInsertString<true>(*in, column, bson_type);
+            return true;
+        }
+        case TypeIndex::String:
+        {
+            readAndInsertString<false>(*in, column, bson_type);
+            return true;
+        }
+        case TypeIndex::UUID:
+        {
+            readAndInsertUUID(*in, column, bson_type);
+            return true;
+        }
+        case TypeIndex::Array:
+        {
+            readArray(column, data_type, bson_type);
+            return true;
+        }
+        case TypeIndex::Tuple:
+        {
+            readTuple(column, data_type, bson_type);
+            return true;
+        }
+        case TypeIndex::Map:
+        {
+            readMap(column, data_type, bson_type);
+            return true;
+        }
+        default:
+        {
+            /// This is the input format, so report the type as unsupported for input.
+            throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Type {} is not supported for input in BSON format", data_type->getName());
+        }
+    }
+}
+
+/// Skips the value of a single BSON field of the given type without parsing it.
+/// The field type byte and the key name must already be consumed by the caller.
+static void skipBSONField(ReadBuffer & in, BSONType type)
+{
+    /// Reads the size prefix used by variable-length BSON values.
+    auto read_size = [&in]()
+    {
+        BSONSizeT size;
+        readBinary(size, in);
+        return size;
+    };
+
+    switch (type)
+    {
+        /// Fixed-size values.
+        case BSONType::DOUBLE:
+            in.ignore(sizeof(Float64));
+            return;
+        case BSONType::BOOL:
+            in.ignore(sizeof(UInt8));
+            return;
+        case BSONType::INT64: [[fallthrough]];
+        case BSONType::DATETIME: [[fallthrough]];
+        case BSONType::TIMESTAMP:
+            in.ignore(sizeof(UInt64));
+            return;
+        case BSONType::INT32:
+            in.ignore(sizeof(Int32));
+            return;
+        case BSONType::OBJECT_ID:
+            in.ignore(12);
+            return;
+        case BSONType::DECIMAL128:
+            in.ignore(16);
+            return;
+
+        /// Values without a payload.
+        case BSONType::MIN_KEY: [[fallthrough]];
+        case BSONType::MAX_KEY: [[fallthrough]];
+        case BSONType::UNDEFINED: [[fallthrough]];
+        case BSONType::NULL_VALUE:
+            return;
+
+        /// Size prefix followed by that many bytes.
+        case BSONType::JAVA_SCRIPT_CODE: [[fallthrough]];
+        case BSONType::SYMBOL: [[fallthrough]];
+        case BSONType::STRING:
+        {
+            const BSONSizeT size = read_size();
+            in.ignore(size);
+            return;
+        }
+
+        /// The size prefix counts itself, so only the rest has to be skipped.
+        case BSONType::DOCUMENT: [[fallthrough]];
+        case BSONType::ARRAY: [[fallthrough]];
+        case BSONType::JAVA_SCRIPT_CODE_W_SCOPE:
+        {
+            const BSONSizeT size = read_size();
+            in.ignore(size - sizeof(size));
+            return;
+        }
+
+        /// Size prefix, one extra byte, then the data.
+        case BSONType::BINARY:
+        {
+            const BSONSizeT size = read_size();
+            in.ignore(size + 1);
+            return;
+        }
+
+        /// Size-prefixed data followed by 12 more bytes.
+        case BSONType::DB_POINTER:
+        {
+            const BSONSizeT size = read_size();
+            in.ignore(size + 12);
+            return;
+        }
+
+        /// Two consecutive null-terminated strings.
+        case BSONType::REGEXP:
+            skipNullTerminated(in);
+            skipNullTerminated(in);
+            return;
+    }
+}
+
+/// Handles a field whose key doesn't correspond to any column: skips its value when the
+/// skip_unknown_fields setting is enabled, otherwise throws INCORRECT_DATA.
+void BSONEachRowRowInputFormat::skipUnknownField(BSONType type, const String & key_name)
+{
+    if (format_settings.skip_unknown_fields)
+    {
+        skipBSONField(*in, type);
+        return;
+    }
+
+    throw Exception(ErrorCodes::INCORRECT_DATA, "Unknown field found while parsing BSONEachRow format: {}", key_name);
+}
+
+/// Positions the input buffer after the document that failed to parse, so that parsing
+/// can continue from the next document.
+void BSONEachRowRowInputFormat::syncAfterError()
+{
+    /// Skip all remaining bytes in current document
+    size_t already_read_bytes = in->count() - current_document_start;
+    size_t document_size = current_document_size;
+
+    /// If at least as many bytes as the declared document size were already consumed
+    /// (e.g. a malformed field ran past the end of the document), there is nothing left to skip.
+    /// Without this check the unsigned subtraction below would wrap around to a huge value.
+    if (already_read_bytes < document_size)
+        in->ignore(document_size - already_read_bytes);
+}
+
+/// Reads one BSON document from the input and appends one value to each of the columns.
+///
+/// Fields are matched to columns by key name; unknown keys are passed to skipUnknownField,
+/// columns that were not met in the document are filled with default values.
+/// ext.read_columns tells which columns got a real value (used to apply defaults for omitted fields).
+///
+/// Returns false if the input is exhausted, true if a row was read.
+/// Throws INCORRECT_DATA if the same column is met more than once in the document.
+bool BSONEachRowRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & ext)
+{
+    size_t num_columns = columns.size();
+
+    read_columns.assign(num_columns, false);
+    seen_columns.assign(num_columns, false);
+
+    if (in->eof())
+        return false;
+
+    size_t key_index = 0;
+
+    current_document_start = in->count();
+    readBinary(current_document_size, *in);
+    while (in->count() - current_document_start + sizeof(BSON_DOCUMENT_END) != current_document_size)
+    {
+        auto type = getBSONType(readBSONType(*in));
+        auto name = readBSONKeyName(*in, current_key_name);
+        auto index = columnIndex(name, key_index);
+
+        if (index == UNKNOWN_FIELD)
+        {
+            current_key_name.assign(name.data, name.size);
+            skipUnknownField(BSONType(type), current_key_name);
+        }
+        else
+        {
+            /// A repeated key would append a second value to the same column for this row
+            /// and make the column sizes inconsistent, so reject it explicitly.
+            if (seen_columns[index])
+                throw Exception(ErrorCodes::INCORRECT_DATA, "Duplicate field found while parsing BSONEachRow format: {}", name);
+
+            seen_columns[index] = true;
+            read_columns[index] = readField(*columns[index], types[index], BSONType(type));
+        }
+
+        ++key_index;
+    }
+
+    assertChar(BSON_DOCUMENT_END, *in);
+
+    const auto & header = getPort().getHeader();
+    /// Fill non-visited columns with the default values.
+    for (size_t i = 0; i < num_columns; ++i)
+        if (!seen_columns[i])
+            header.getByPosition(i).type->insertDefaultInto(*columns[i]);
+
+    /// When defaults for omitted fields are disabled, report every column as read.
+    if (format_settings.defaults_for_omitted_fields)
+        ext.read_columns = read_columns;
+    else
+        ext.read_columns.assign(read_columns.size(), true);
+
+    return true;
+}
+
+/// Creates a schema reader over the given buffer; all the state is kept by the base class.
+BSONEachRowSchemaReader::BSONEachRowSchemaReader(
+    ReadBuffer & in_,
+    const FormatSettings & settings_)
+    : IRowWithNamesSchemaReader(in_, settings_) {}
+
+/// Consumes the value of a single BSON field of the given type and returns the ClickHouse
+/// data type inferred for it.
+///
+/// Returns nullptr when the type cannot be determined (Null value, empty array/document or
+/// nested values of undetermined type) and when the field was skipped.
+/// If the BSON type is not supported: when allow_to_skip_unsupported_types is true the field
+/// is skipped and `skip` is set to true, otherwise UNKNOWN_TYPE is thrown.
+DataTypePtr BSONEachRowSchemaReader::getDataTypeFromBSONField(BSONType type, bool allow_to_skip_unsupported_types, bool & skip)
+{
+    switch (type)
+    {
+        case BSONType::DOUBLE:
+        {
+            in.ignore(sizeof(Float64));
+            return makeNullable(std::make_shared<DataTypeFloat64>());
+        }
+        case BSONType::BOOL:
+        {
+            in.ignore(sizeof(UInt8));
+            return makeNullable(DataTypeFactory::instance().get("Bool"));
+        }
+        case BSONType::INT64:
+        {
+            in.ignore(sizeof(Int64));
+            return makeNullable(std::make_shared<DataTypeInt64>());
+        }
+        case BSONType::DATETIME:
+        {
+            in.ignore(sizeof(Int64));
+            return makeNullable(std::make_shared<DataTypeDateTime64>(6, "UTC"));
+        }
+        case BSONType::INT32:
+        {
+            in.ignore(sizeof(Int32));
+            return makeNullable(std::make_shared<DataTypeInt32>());
+        }
+        case BSONType::OBJECT_ID:
+        {
+            /// ObjectId is a fixed 12-byte value without a size prefix, so it must not be
+            /// parsed as a length-prefixed string: that would desynchronize the stream.
+            in.ignore(12);
+            return makeNullable(std::make_shared<DataTypeString>());
+        }
+        case BSONType::SYMBOL: [[fallthrough]];
+        case BSONType::JAVA_SCRIPT_CODE: [[fallthrough]];
+        case BSONType::STRING:
+        {
+            BSONSizeT size;
+            readBinary(size, in);
+            in.ignore(size);
+            return makeNullable(std::make_shared<DataTypeString>());
+        }
+        case BSONType::DOCUMENT:
+        {
+            /// A document whose values all have the same type is inferred as Map(String, T),
+            /// otherwise as a named Tuple.
+            auto nested_names_and_types = getDataTypesFromBSONDocument(false);
+            auto nested_types = nested_names_and_types.getTypes();
+            bool types_are_equal = true;
+            if (nested_types.empty() || !nested_types[0])
+                return nullptr;
+
+            for (size_t i = 1; i != nested_types.size(); ++i)
+            {
+                if (!nested_types[i])
+                    return nullptr;
+
+                types_are_equal &= nested_types[i]->equals(*nested_types[0]);
+            }
+
+            if (types_are_equal)
+                return std::make_shared<DataTypeMap>(std::make_shared<DataTypeString>(), nested_types[0]);
+
+            return std::make_shared<DataTypeTuple>(std::move(nested_types), nested_names_and_types.getNames());
+        }
+        case BSONType::ARRAY:
+        {
+            /// An array whose elements all have the same type is inferred as Array(T),
+            /// otherwise as an unnamed Tuple.
+            auto nested_types = getDataTypesFromBSONDocument(false).getTypes();
+            bool types_are_equal = true;
+            if (nested_types.empty() || !nested_types[0])
+                return nullptr;
+
+            for (size_t i = 1; i != nested_types.size(); ++i)
+            {
+                if (!nested_types[i])
+                    return nullptr;
+
+                types_are_equal &= nested_types[i]->equals(*nested_types[0]);
+            }
+
+            if (types_are_equal)
+                return std::make_shared<DataTypeArray>(nested_types[0]);
+
+            return std::make_shared<DataTypeTuple>(std::move(nested_types));
+        }
+        case BSONType::BINARY:
+        {
+            /// Layout: size, subtype byte, then `size` bytes of data.
+            BSONSizeT size;
+            readBinary(size, in);
+            auto subtype = getBSONBinarySubtype(readBSONType(in));
+            in.ignore(size);
+            switch (subtype)
+            {
+                case BSONBinarySubtype::BINARY_OLD: [[fallthrough]];
+                case BSONBinarySubtype::BINARY:
+                    return makeNullable(std::make_shared<DataTypeString>());
+                case BSONBinarySubtype::UUID_OLD: [[fallthrough]];
+                case BSONBinarySubtype::UUID:
+                    return makeNullable(std::make_shared<DataTypeUUID>());
+                default:
+                    throw Exception(ErrorCodes::UNKNOWN_TYPE, "BSON binary subtype {} is not supported", getBSONBinarySubtypeName(subtype));
+            }
+        }
+        case BSONType::NULL_VALUE:
+        {
+            return nullptr;
+        }
+        default:
+        {
+            if (!allow_to_skip_unsupported_types)
+                throw Exception(ErrorCodes::UNKNOWN_TYPE, "BSON type {} is not supported", getBSONTypeName(type));
+
+            skip = true;
+            skipBSONField(in, type);
+            return nullptr;
+        }
+    }
+}
+
+/// Reads a whole BSON document and returns the names of its fields together with the data
+/// types inferred for them. Fields reported as skipped by getDataTypeFromBSONField are not included.
+NamesAndTypesList BSONEachRowSchemaReader::getDataTypesFromBSONDocument(bool allow_to_skip_unsupported_types)
+{
+    const size_t start_position = in.count();
+    BSONSizeT total_size;
+    readBinary(total_size, in);
+
+    NamesAndTypesList result;
+
+    /// Read fields until only the terminating byte of the document is left.
+    for (;;)
+    {
+        if (in.count() - start_position + sizeof(BSON_DOCUMENT_END) == total_size)
+            break;
+
+        auto field_bson_type = getBSONType(readBSONType(in));
+
+        String field_name;
+        readNullTerminated(field_name, in);
+
+        bool field_skipped = false;
+        auto field_type = getDataTypeFromBSONField(field_bson_type, allow_to_skip_unsupported_types, field_skipped);
+        if (field_skipped)
+            continue;
+
+        result.emplace_back(field_name, field_type);
+    }
+
+    assertChar(BSON_DOCUMENT_END, in);
+
+    return result;
+}
+
+/// Infers names and types from the next document in the input.
+/// Sets `eof` to true and returns an empty list when there is no more data.
+NamesAndTypesList BSONEachRowSchemaReader::readRowAndGetNamesAndDataTypes(bool & eof)
+{
+    if (!in.eof())
+        return getDataTypesFromBSONDocument(format_settings.bson.skip_fields_with_unsupported_types_in_schema_inference);
+
+    eof = true;
+    return {};
+}
+
+/// Replaces both types with their least common supertype if one exists;
+/// otherwise leaves both of them untouched.
+void BSONEachRowSchemaReader::transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type)
+{
+    DataTypes candidates = {type, new_type};
+    auto common_type = tryGetLeastSupertype(candidates);
+    if (!common_type)
+        return;
+
+    new_type = common_type;
+    type = new_type;
+}
+
+/// Splits the input into chunks of whole BSON documents for parallel parsing.
+///
+/// Copies documents from `in` to `memory` one by one until `memory` holds at least min_bytes,
+/// max_rows documents were copied or the input is exhausted.
+/// Returns {whether there is more data in the input, number of copied documents}.
+/// Throws INCORRECT_DATA if the declared document size is smaller than the size field itself
+/// or is unreasonably large compared to min_bytes.
+static std::pair<bool, size_t>
+fileSegmentationEngineBSONEachRow(ReadBuffer & in, DB::Memory<> & memory, size_t min_bytes, size_t max_rows)
+{
+    size_t number_of_rows = 0;
+
+    while (!in.eof() && memory.size() < min_bytes && number_of_rows < max_rows)
+    {
+        BSONSizeT document_size;
+        readBinary(document_size, in);
+
+        /// The document size includes the size field itself. A smaller value would make the
+        /// subtraction below wrap around and the copy would write past the resized buffer.
+        if (document_size < sizeof(document_size))
+            throw ParsingException(
+                ErrorCodes::INCORRECT_DATA,
+                "Size of BSON document is invalid: {} bytes, expected at least {} bytes",
+                document_size, sizeof(document_size));
+
+        if (min_bytes != 0 && document_size > 10 * min_bytes)
+            throw ParsingException(
+                ErrorCodes::INCORRECT_DATA,
+                "Size of BSON document is extremely large. Expected not greater than {} bytes, but current is {} bytes per row. Increase "
+                "the value setting 'min_chunk_bytes_for_parallel_parsing' or check your data manually, most likely BSON is malformed",
+                min_bytes, document_size);
+
+        /// Put the already consumed size field back in front of the document body,
+        /// so that the chunk contains complete documents.
+        size_t old_size = memory.size();
+        memory.resize(old_size + document_size);
+        memcpy(memory.data() + old_size, reinterpret_cast<char *>(&document_size), sizeof(document_size));
+        in.readStrict(memory.data() + old_size + sizeof(document_size), document_size - sizeof(document_size));
+        ++number_of_rows;
+    }
+
+    return {!in.eof(), number_of_rows};
+}
+
+/// Registers the BSONEachRow input format in the factory.
+void registerInputFormatBSONEachRow(FormatFactory & factory)
+{
+    auto create_input_format = [](
+        ReadBuffer & buf,
+        const Block & sample,
+        IRowInputFormat::Params params,
+        const FormatSettings & settings)
+    {
+        return std::make_shared<BSONEachRowRowInputFormat>(buf, sample, std::move(params), settings);
+    };
+
+    factory.registerInputFormat("BSONEachRow", create_input_format);
+}
+
+/// Registers the file segmentation engine of the BSONEachRow format in the factory.
+void registerFileSegmentationEngineBSONEachRow(FormatFactory & factory)
+{
+    factory.registerFileSegmentationEngine(
+        "BSONEachRow",
+        &fileSegmentationEngineBSONEachRow);
+}
+
+/// Registers the schema reader of the BSONEachRow format and the getter of additional
+/// format info used for the schema cache.
+void registerBSONEachRowSchemaReader(FormatFactory & factory)
+{
+    auto create_schema_reader = [](ReadBuffer & buf, const FormatSettings & settings)
+    {
+        return std::make_unique<BSONEachRowSchemaReader>(buf, settings);
+    };
+    factory.registerSchemaReader("BSONEachRow", create_schema_reader);
+
+    /// The inferred schema depends on this setting, so it is appended to the common info.
+    auto get_additional_info = [](const FormatSettings & settings)
+    {
+        String common_info = getAdditionalFormatInfoForAllRowBasedFormats(settings);
+        String bson_info = fmt::format(
+            ", skip_fields_with_unsupported_types_in_schema_inference={}",
+            settings.bson.skip_fields_with_unsupported_types_in_schema_inference);
+        return common_info + bson_info;
+    };
+    factory.registerAdditionalInfoForSchemaCacheGetter("BSONEachRow", get_additional_info);
+}
+
+}
--- a/src/Processors/Formats/Impl/BSONEachRowRowInputFormat.h
+++ b/src/Processors/Formats/Impl/BSONEachRowRowInputFormat.h
@ -0,0 +1,115 @@
+#pragma once
+
+#include <Core/Block.h>
+#include <Formats/FormatSettings.h>
+#include <Formats/BSONTypes.h>
+#include <Processors/Formats/IRowInputFormat.h>
+#include <Processors/Formats/ISchemaReader.h>
+#include <Common/HashTable/HashMap.h>
+
+
+namespace DB
+{
+
+/*
+ * Class for parsing data in BSON format.
+ * Each row is parsed as a separate BSON document.
+ * Each column is parsed as a single field with column name as a key.
+ * It uses the following correspondence between BSON types and ClickHouse types:
+ *
+ * BSON Type                                   | ClickHouse Type
+ * \x01 double                                 | Float32/Float64
+ * \x02 string                                 | String/FixedString
+ * \x03 document                               | Map/Named Tuple
+ * \x04 array                                  | Array/Tuple
+ * \x05 binary, \x00 binary subtype            | String/FixedString
+ * \x05 binary, \x02 old binary subtype        | String/FixedString
+ * \x05 binary, \x03 old uuid subtype          | UUID
+ * \x05 binary, \x04 uuid subtype              | UUID
+ * \x07 ObjectId                               | String
+ * \x08 boolean                                | Bool
+ * \x09 datetime                               | DateTime64
+ * \x0A null value                             | NULL
+ * \x0D JavaScript code                        | String
+ * \x0E symbol                                 | String/FixedString
+ * \x10 int32                                  | Int32/Decimal32
+ * \x12 int64                                  | Int64/Decimal64/DateTime64
+ * \x11 uint64                                 | UInt64
+ *
+ * Other BSON types are not supported.
+ * Also, we perform conversion between different integer types
+ * (for example, you can insert BSON int32 value into ClickHouse UInt8)
+ * Big integers and decimals Int128/UInt128/Int256/UInt256/Decimal128/Decimal256
+ * can be parsed from BSON Binary value with \x00 binary subtype. In this case
+ * we validate that the size of binary data equals the size of expected value.
+ *
+ * Note: this format will not work on Big-Endian platforms.
+ */
+
+class ReadBuffer;
+class BSONEachRowRowInputFormat final : public IRowInputFormat
+{
+public:
+    BSONEachRowRowInputFormat(
+        ReadBuffer & in_, const Block & header_, Params params_, const FormatSettings & format_settings_);
+
+    String getName() const override { return "BSONEachRowRowInputFormat"; }
+    void resetParser() override { }
+
+private:
+    void readPrefix() override { }
+    void readSuffix() override { }
+
+    /// Reads one BSON document as one row. Returns false when the input is exhausted.
+    bool readRow(MutableColumns & columns, RowReadExtension & ext) override;
+    bool allowSyncAfterError() const override { return true; }
+    /// Skips the rest of the current document after a parsing error.
+    void syncAfterError() override;
+
+    size_t columnIndex(const StringRef & name, size_t key_index);
+
+    using ColumnReader = std::function<void(StringRef name, BSONType type)>;
+
+    /// Reads one BSON value into the column. Returns false if a default value was inserted instead of BSON Null.
+    bool readField(IColumn & column, const DataTypePtr & data_type, BSONType bson_type);
+    /// Skips the value of a field that doesn't correspond to any column or throws if skipping is not allowed.
+    void skipUnknownField(BSONType type, const String & key_name);
+
+    void readTuple(IColumn & column, const DataTypePtr & data_type, BSONType bson_type);
+    void readArray(IColumn & column, const DataTypePtr & data_type, BSONType bson_type);
+    void readMap(IColumn & column, const DataTypePtr & data_type, BSONType bson_type);
+
+    const FormatSettings format_settings;
+
+    /// Buffer for the read from the stream field name. Used when you have to copy it.
+    String current_key_name;
+
+    /// Set of columns for which the values were read. The rest will be filled with default values.
+    std::vector<UInt8> read_columns;
+    /// Set of columns which already met in row. Exception is thrown if there are more than one column with the same name.
+    std::vector<UInt8> seen_columns;
+    /// These sets may be different, because if null_as_default=1 read_columns[i] will be false and seen_columns[i] will be true
+    /// for row like {..., "non-nullable column name" : null, ...}
+
+    /// Hash table match `field name -> position in the block`.
+    Block::NameMap name_map;
+
+    /// Cached search results for previous row (keyed as index in BSON document) - used as a hint.
+    std::vector<Block::NameMap::LookupResult> prev_positions;
+
+    DataTypes types;
+
+    /// Position in the input buffer where the current document starts and its declared size.
+    /// Initialized explicitly because syncAfterError() reads them and can be called
+    /// when reading the size of the very first document has failed.
+    size_t current_document_start = 0;
+    BSONSizeT current_document_size = 0;
+};
+
+/// Schema reader for the BSONEachRow format: infers column names and types from BSON documents.
+class BSONEachRowSchemaReader : public IRowWithNamesSchemaReader
+{
+public:
+    BSONEachRowSchemaReader(ReadBuffer & in_, const FormatSettings & settings_);
+
+private:
+    /// Reads the next document and returns the names and types of its fields; sets `eof` when there is no more data.
+    NamesAndTypesList readRowAndGetNamesAndDataTypes(bool & eof) override;
+    /// Replaces both types with their least supertype if one exists.
+    void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type) override;
+
+    /// Reads a whole document and returns names and inferred types of its fields.
+    NamesAndTypesList getDataTypesFromBSONDocument(bool skip_unsupported_types);
+    /// Consumes one field value and returns its inferred type; sets `skip` if the field was skipped as unsupported.
+    DataTypePtr getDataTypeFromBSONField(BSONType type, bool skip_unsupported_types, bool & skip);
+};
+
+}
--- a/src/Processors/Formats/Impl/BSONEachRowRowOutputFormat.cpp
+++ b/src/Processors/Formats/Impl/BSONEachRowRowOutputFormat.cpp
@ -0,0 +1,527 @@
+#include <Processors/Formats/Impl/BSONEachRowRowOutputFormat.h>
+
+#include <Formats/FormatFactory.h>
+#include <Formats/BSONTypes.h>
+
+#include <Columns/ColumnArray.h>
+#include <Columns/ColumnNullable.h>
+#include <Columns/ColumnString.h>
+#include <Columns/ColumnFixedString.h>
+#include <Columns/ColumnLowCardinality.h>
+#include <Columns/ColumnTuple.h>
+#include <Columns/ColumnMap.h>
+#include <Columns/ColumnDecimal.h>
+
+#include <DataTypes/DataTypeNullable.h>
+#include <DataTypes/DataTypeLowCardinality.h>
+#include <DataTypes/DataTypeArray.h>
+#include <DataTypes/DataTypeTuple.h>
+#include <DataTypes/DataTypeMap.h>
+
+#include <IO/WriteHelpers.h>
+#include <IO/WriteBufferValidUTF8.h>
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int INCORRECT_DATA;
+    extern const int ILLEGAL_COLUMN;
+    extern const int LOGICAL_ERROR;
+}
+
+/// BSON requires key names to be valid UTF-8 sequences, so the name is passed
+/// through a validating write buffer and the resulting string is returned.
+static String toValidUTF8String(const String & name)
+{
+    WriteBufferFromOwnString sanitized;
+    WriteBufferValidUTF8 utf8_filter(sanitized);
+    writeString(name, utf8_filter);
+    /// Finalize the filter before taking the string from the underlying buffer.
+    utf8_filter.finalize();
+    return sanitized.str();
+}
+
+BSONEachRowRowOutputFormat::BSONEachRowRowOutputFormat(
+    WriteBuffer & out_, const Block & header_, const RowOutputFormatParams & params_, const FormatSettings & settings_)
+    : IRowOutputFormat(header_, out_, params_), settings(settings_)
+{
+    /// Remember every header column as (UTF-8-sanitized name, type); the names are used as BSON keys in write().
+    const NamesAndTypes header_fields = getPort(PortKind::Main).getHeader().getNamesAndTypes();
+    fields.reserve(header_fields.size());
+    for (const auto & header_field : header_fields)
+        fields.emplace_back(toValidUTF8String(header_field.name), header_field.type);
+}
+
+/// Writes `size` to `buf` as a BSONSizeT value.
+/// Sizes above MAX_BSON_SIZE are rejected with an INCORRECT_DATA exception.
+static void writeBSONSize(size_t size, WriteBuffer & buf)
+{
+    if (size <= MAX_BSON_SIZE)
+    {
+        writePODBinary<BSONSizeT>(BSONSizeT(size), buf);
+        return;
+    }
+
+    throw Exception(ErrorCodes::INCORRECT_DATA, "Too large document/value size: {}. Maximum allowed size: {}.", size, MAX_BSON_SIZE);
+}
+
+/// Writes a type tag (BSONType or BSONBinarySubtype in this file) as a single byte.
+template <typename Type>
+static void writeBSONType(Type type, WriteBuffer & buf)
+{
+    const UInt8 type_byte = static_cast<UInt8>(type);
+    writeBinary(type_byte, buf);
+}
+
+/// Writes the common prefix of every BSON element: the type byte, then the key name
+/// followed by a terminating zero byte.
+static void writeBSONTypeAndKeyName(BSONType type, const String & name, WriteBuffer & buf)
+{
+    writeBSONType(type, buf);
+
+    writeString(name, buf);
+    writeChar('\0', buf);
+}
+
+/// Writes a fixed-size numeric element: the element prefix for `type`/`name`, then the value
+/// at `row_num` converted to ValueType and dumped as raw bytes.
+template <typename ColumnType, typename ValueType>
+static void writeBSONNumber(BSONType type, const IColumn & column, size_t row_num, const String & name, WriteBuffer & buf)
+{
+    const auto & typed_column = assert_cast<const ColumnType &>(column);
+
+    writeBSONTypeAndKeyName(type, name, buf);
+    writePODBinary<ValueType>(typed_column.getElement(row_num), buf);
+}
+
+/// Writes a string value either as a BSON string (`as_bson_string` is true) or as BSON binary data.
+///  - string: element prefix, size of (data + trailing \0), data, \0
+///  - binary: element prefix, size of data, generic binary subtype byte, data
+template <typename StringColumnType>
+static void writeBSONString(const IColumn & column, size_t row_num, const String & name, WriteBuffer & buf, bool as_bson_string)
+{
+    StringRef value = assert_cast<const StringColumnType &>(column).getDataAt(row_num);
+
+    if (!as_bson_string)
+    {
+        writeBSONTypeAndKeyName(BSONType::BINARY, name, buf);
+        writeBSONSize(value.size, buf);
+        writeBSONType(BSONBinarySubtype::BINARY, buf);
+        writeString(value, buf);
+        return;
+    }
+
+    writeBSONTypeAndKeyName(BSONType::STRING, name, buf);
+    writeBSONSize(value.size + 1, buf); /// The stored size includes the trailing \0
+    writeString(value, buf);
+    writeChar(0x00, buf);
+}
+
+/// Writes a wide value (the 128/256-bit integer and decimal columns in this file) as BSON binary data
+/// with the generic binary subtype; the payload is the value's bytes as returned by getDataAt().
+template <class ColumnType>
+static void writeBSONBigInteger(const IColumn & column, size_t row_num, const String & name, WriteBuffer & buf)
+{
+    /// Header: element prefix, payload length, binary subtype.
+    writeBSONTypeAndKeyName(BSONType::BINARY, name, buf);
+    writeBSONSize(sizeof(typename ColumnType::ValueType), buf);
+    writeBSONType(BSONBinarySubtype::BINARY, buf);
+
+    /// Payload.
+    auto raw_value = assert_cast<const ColumnType &>(column).getDataAt(row_num);
+    buf.write(raw_value.data, raw_value.size);
+}
+
+/// Returns the number of bytes serializeField() produces for the value of `column` at `row_num`
+/// written under the key `name`. write() compares the sum of these sizes with the number of bytes
+/// actually written, so every case here has to stay in sync with serializeField().
+/// Throws ILLEGAL_COLUMN for unsupported types and for maps whose key type is not String/FixedString.
+size_t BSONEachRowRowOutputFormat::countBSONFieldSize(const IColumn & column, const DataTypePtr & data_type, size_t row_num, const String & name)
+{
+    /// Common prefix of every element: one type byte, the key name and its terminating \0.
+    size_t size = 1 + name.size() + 1;
+
+    switch (column.getDataType())
+    {
+        /// Values written as 4 bytes.
+        case TypeIndex::Int8:
+        case TypeIndex::Int16:
+        case TypeIndex::UInt16:
+        case TypeIndex::Date:
+        case TypeIndex::Date32:
+        case TypeIndex::Decimal32:
+        case TypeIndex::Int32:
+            return size + sizeof(Int32);
+
+        /// Bool takes a single byte, a plain UInt8 is written as 4 bytes.
+        case TypeIndex::UInt8:
+            return size + (isBool(data_type) ? 1 : sizeof(Int32));
+
+        /// Values written as 8 bytes.
+        case TypeIndex::Float32:
+        case TypeIndex::Float64:
+        case TypeIndex::UInt32:
+        case TypeIndex::Int64:
+        case TypeIndex::UInt64:
+        case TypeIndex::DateTime:
+        case TypeIndex::Decimal64:
+        case TypeIndex::DateTime64:
+            return size + sizeof(UInt64);
+
+        /// Size of a binary + binary subtype + 16 bytes of value
+        case TypeIndex::Int128:
+        case TypeIndex::UInt128:
+        case TypeIndex::Decimal128:
+            return size + sizeof(BSONSizeT) + 1 + sizeof(UInt128);
+
+        /// Size of a binary + binary subtype + 32 bytes of value
+        case TypeIndex::Int256:
+        case TypeIndex::UInt256:
+        case TypeIndex::Decimal256:
+            return size + sizeof(BSONSizeT) + 1 + sizeof(UInt256);
+
+        /// Size of data + data + one extra byte: \0 for a BSON string or the subtype for BSON binary
+        case TypeIndex::String:
+        {
+            const auto & strings = assert_cast<const ColumnString &>(column);
+            return size + sizeof(BSONSizeT) + strings.getDataAt(row_num).size + 1;
+        }
+        case TypeIndex::FixedString:
+        {
+            const auto & fixed_strings = assert_cast<const ColumnFixedString &>(column);
+            return size + sizeof(BSONSizeT) + fixed_strings.getN() + 1;
+        }
+
+        /// Size of data + BSON binary subtype + 16 bytes of value
+        case TypeIndex::UUID:
+            return size + sizeof(BSONSizeT) + 1 + sizeof(UUID);
+
+        /// LowCardinality: measure the dictionary value this row points to.
+        case TypeIndex::LowCardinality:
+        {
+            const auto & low_cardinality = assert_cast<const ColumnLowCardinality &>(column);
+            auto dictionary_type = assert_cast<const DataTypeLowCardinality *>(data_type.get())->getDictionaryType();
+            auto dictionary_values = low_cardinality.getDictionary().getNestedColumn();
+            size_t dictionary_row = low_cardinality.getIndexAt(row_num);
+            return countBSONFieldSize(*dictionary_values, dictionary_type, dictionary_row, name);
+        }
+
+        /// Nullable: a null is just the element prefix, otherwise measure the nested value.
+        case TypeIndex::Nullable:
+        {
+            auto not_null_type = removeNullable(data_type);
+            const auto & nullable = assert_cast<const ColumnNullable &>(column);
+            if (!nullable.isNullAt(row_num))
+                return countBSONFieldSize(nullable.getNestedColumn(), not_null_type, row_num, name);
+            return size;
+        }
+
+        /// Array: a nested document whose keys are the element positions "0", "1", ...
+        case TypeIndex::Array:
+        {
+            const auto & element_type = assert_cast<const DataTypeArray *>(data_type.get())->getNestedType();
+            const auto & array_column = assert_cast<const ColumnArray &>(column);
+            const IColumn & elements = array_column.getData();
+            const ColumnArray::Offsets & array_offsets = array_column.getOffsets();
+            const size_t first = array_offsets[row_num - 1];
+            const size_t count = array_offsets[row_num] - first;
+
+            size += sizeof(BSONSizeT); /// Size of the nested document
+            for (size_t pos = 0; pos < count; ++pos)
+                size += countBSONFieldSize(elements, element_type, first + pos, std::to_string(pos));
+
+            return size + sizeof(BSON_DOCUMENT_END); /// Final \0
+        }
+
+        /// Tuple: a nested document keyed by element names (named tuple) or by positions (unnamed tuple).
+        case TypeIndex::Tuple:
+        {
+            const auto * tuple_data_type = assert_cast<const DataTypeTuple *>(data_type.get());
+            const auto & element_types = tuple_data_type->getElements();
+            const auto & element_names = tuple_data_type->getElementNames();
+            const bool named = tuple_data_type->haveExplicitNames();
+            const auto & element_columns = assert_cast<const ColumnTuple &>(column).getColumns();
+
+            size += sizeof(BSONSizeT); /// Size of the nested document
+            for (size_t pos = 0; pos < element_columns.size(); ++pos)
+            {
+                String element_key;
+                if (named)
+                    element_key = toValidUTF8String(element_names[pos]);
+                else
+                    element_key = std::to_string(pos);
+
+                size += countBSONFieldSize(*element_columns[pos], element_types[pos], row_num, element_key);
+            }
+
+            return size + sizeof(BSON_DOCUMENT_END); /// Final \0
+        }
+
+        /// Map: a nested document keyed by the (UTF-8-sanitized) map keys.
+        case TypeIndex::Map:
+        {
+            const auto & map_data_type = assert_cast<const DataTypeMap &>(*data_type);
+            if (!isStringOrFixedString(map_data_type.getKeyType()))
+                throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Only maps with String key type are supported in BSON, got key type: {}", map_data_type.getKeyType()->getName());
+            const auto & mapped_type = map_data_type.getValueType();
+
+            const auto & map_column = assert_cast<const ColumnMap &>(column);
+            const auto & entries = map_column.getNestedColumn();
+            const auto & entry_columns = map_column.getNestedData().getColumns();
+            const auto & keys = entry_columns[0];
+            const auto & values = entry_columns[1];
+            const auto & entry_offsets = entries.getOffsets();
+            const size_t first = entry_offsets[row_num - 1];
+            const size_t count = entry_offsets[row_num] - first;
+
+            size += sizeof(BSONSizeT); /// Size of the nested document
+            for (size_t pos = 0; pos < count; ++pos)
+            {
+                String entry_key = toValidUTF8String(keys->getDataAt(first + pos).toString());
+                size += countBSONFieldSize(*values, mapped_type, first + pos, entry_key);
+            }
+
+            return size + sizeof(BSON_DOCUMENT_END); /// Final \0
+        }
+
+        default:
+            throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Type {} is not supported in BSON output format", data_type->getName());
+    }
+}
+
+/// Writes the value of `column` at `row_num` to `out` as one BSON element with the key `name`:
+/// the element prefix (type byte + key + \0) followed by the value in the representation chosen
+/// for the column's type. Nested types (Array, Tuple, Map) are written as nested documents whose
+/// size is computed beforehand with countBSONFieldSize(), so the two functions must agree on
+/// every key name and value layout.
+/// Throws ILLEGAL_COLUMN for unsupported types and for maps whose key type is not String/FixedString.
+void BSONEachRowRowOutputFormat::serializeField(const IColumn & column, const DataTypePtr & data_type, size_t row_num, const String & name)
+{
+    switch (column.getDataType())
+    {
+        case TypeIndex::Float32:
+        {
+            writeBSONNumber<ColumnFloat32, double>(BSONType::DOUBLE, column, row_num, name, out);
+            break;
+        }
+        case TypeIndex::Float64:
+        {
+            writeBSONNumber<ColumnFloat64, double>(BSONType::DOUBLE, column, row_num, name, out);
+            break;
+        }
+        case TypeIndex::Int8:
+        {
+            writeBSONNumber<ColumnInt8, Int32>(BSONType::INT32, column, row_num, name, out);
+            break;
+        }
+        case TypeIndex::UInt8:
+        {
+            /// Bool shares the UInt8 column type and is told apart by the data type.
+            if (isBool(data_type))
+                writeBSONNumber<ColumnUInt8, bool>(BSONType::BOOL, column, row_num, name, out);
+            else
+                writeBSONNumber<ColumnUInt8, Int32>(BSONType::INT32, column, row_num, name, out);
+            break;
+        }
+        case TypeIndex::Int16:
+        {
+            writeBSONNumber<ColumnInt16, Int32>(BSONType::INT32, column, row_num, name, out);
+            break;
+        }
+        case TypeIndex::Date: [[fallthrough]];
+        case TypeIndex::UInt16:
+        {
+            writeBSONNumber<ColumnUInt16, Int32>(BSONType::INT32, column, row_num, name, out);
+            break;
+        }
+        case TypeIndex::Date32: [[fallthrough]];
+        case TypeIndex::Int32:
+        {
+            writeBSONNumber<ColumnInt32, Int32>(BSONType::INT32, column, row_num, name, out);
+            break;
+        }
+        case TypeIndex::DateTime: [[fallthrough]];
+        case TypeIndex::UInt32:
+        {
+            writeBSONNumber<ColumnUInt32, Int64>(BSONType::INT64, column, row_num, name, out);
+            break;
+        }
+        case TypeIndex::Int64:
+        {
+            writeBSONNumber<ColumnInt64, Int64>(BSONType::INT64, column, row_num, name, out);
+            break;
+        }
+        case TypeIndex::UInt64:
+        {
+            /// The 8 bytes of the unsigned value are written as is under the int64 type tag.
+            writeBSONNumber<ColumnUInt64, UInt64>(BSONType::INT64, column, row_num, name, out);
+            break;
+        }
+        case TypeIndex::Int128:
+        {
+            writeBSONBigInteger<ColumnInt128>(column, row_num, name, out);
+            break;
+        }
+        case TypeIndex::UInt128:
+        {
+            writeBSONBigInteger<ColumnUInt128>(column, row_num, name, out);
+            break;
+        }
+        case TypeIndex::Int256:
+        {
+            writeBSONBigInteger<ColumnInt256>(column, row_num, name, out);
+            break;
+        }
+        case TypeIndex::UInt256:
+        {
+            writeBSONBigInteger<ColumnUInt256>(column, row_num, name, out);
+            break;
+        }
+        case TypeIndex::Decimal32:
+        {
+            writeBSONNumber<ColumnDecimal<Decimal32>, Decimal32>(BSONType::INT32, column, row_num, name, out);
+            break;
+        }
+        case TypeIndex::DateTime64:
+        {
+            writeBSONNumber<ColumnDecimal<DateTime64>, Decimal64>(BSONType::DATETIME, column, row_num, name, out);
+            break;
+        }
+        case TypeIndex::Decimal64:
+        {
+            writeBSONNumber<ColumnDecimal<Decimal64>, Decimal64>(BSONType::INT64, column, row_num, name, out);
+            break;
+        }
+        case TypeIndex::Decimal128:
+        {
+            writeBSONBigInteger<ColumnDecimal<Decimal128>>(column, row_num, name, out);
+            break;
+        }
+        case TypeIndex::Decimal256:
+        {
+            writeBSONBigInteger<ColumnDecimal<Decimal256>>(column, row_num, name, out);
+            break;
+        }
+        case TypeIndex::String:
+        {
+            writeBSONString<ColumnString>(column, row_num, name, out, settings.bson.output_string_as_string);
+            break;
+        }
+        case TypeIndex::FixedString:
+        {
+            writeBSONString<ColumnFixedString>(column, row_num, name, out, settings.bson.output_string_as_string);
+            break;
+        }
+        case TypeIndex::UUID:
+        {
+            writeBSONTypeAndKeyName(BSONType::BINARY, name, out);
+            writeBSONSize(sizeof(UUID), out);
+            writeBSONType(BSONBinarySubtype::UUID, out);
+            writeBinary(assert_cast<const ColumnUUID &>(column).getElement(row_num), out);
+            break;
+        }
+        case TypeIndex::LowCardinality:
+        {
+            /// Serialize the dictionary value this row points to.
+            const auto & lc_column = assert_cast<const ColumnLowCardinality &>(column);
+            auto dict_type = assert_cast<const DataTypeLowCardinality *>(data_type.get())->getDictionaryType();
+            auto dict_column = lc_column.getDictionary().getNestedColumn();
+            size_t index = lc_column.getIndexAt(row_num);
+            serializeField(*dict_column, dict_type, index, name);
+            break;
+        }
+        case TypeIndex::Nullable:
+        {
+            /// A null is written as the element prefix only, with the NULL_VALUE type.
+            auto nested_type = removeNullable(data_type);
+            const ColumnNullable & column_nullable = assert_cast<const ColumnNullable &>(column);
+            if (!column_nullable.isNullAt(row_num))
+                serializeField(column_nullable.getNestedColumn(), nested_type, row_num, name);
+            else
+                writeBSONTypeAndKeyName(BSONType::NULL_VALUE, name, out);
+            break;
+        }
+        case TypeIndex::Array:
+        {
+            const auto & nested_type = assert_cast<const DataTypeArray *>(data_type.get())->getNestedType();
+            const ColumnArray & column_array = assert_cast<const ColumnArray &>(column);
+            const IColumn & nested_column = column_array.getData();
+            const ColumnArray::Offsets & offsets = column_array.getOffsets();
+            size_t offset = offsets[row_num - 1];
+            size_t array_size = offsets[row_num] - offset;
+
+            writeBSONTypeAndKeyName(BSONType::ARRAY, name, out);
+
+            /// The size of the nested document goes before its content, so count it first.
+            size_t document_size = sizeof(BSONSizeT);
+            for (size_t i = 0; i < array_size; ++i)
+                document_size += countBSONFieldSize(nested_column, nested_type, offset + i, std::to_string(i)); // Add size of each value from array
+            document_size += sizeof(BSON_DOCUMENT_END); // Add final \0
+
+            writeBSONSize(document_size, out);
+
+            /// Array elements are keyed by their position: "0", "1", ...
+            for (size_t i = 0; i < array_size; ++i)
+                serializeField(nested_column, nested_type, offset + i, std::to_string(i));
+
+            writeChar(BSON_DOCUMENT_END, out);
+            break;
+        }
+        case TypeIndex::Tuple:
+        {
+            const auto * tuple_type = assert_cast<const DataTypeTuple *>(data_type.get());
+            const auto & nested_types = tuple_type->getElements();
+            bool have_explicit_names = tuple_type->haveExplicitNames();
+            const auto & nested_names = tuple_type->getElementNames();
+            const auto & tuple_column = assert_cast<const ColumnTuple &>(column);
+            const auto & nested_columns = tuple_column.getColumns();
+
+            /// A named tuple becomes a document, an unnamed one becomes an array.
+            BSONType bson_type = have_explicit_names ? BSONType::DOCUMENT : BSONType::ARRAY;
+            writeBSONTypeAndKeyName(bson_type, name, out);
+
+            size_t document_size = sizeof(BSONSizeT);
+            for (size_t i = 0; i < nested_columns.size(); ++i)
+            {
+                String key_name = have_explicit_names ? toValidUTF8String(nested_names[i]) : std::to_string(i);
+                document_size += countBSONFieldSize(*nested_columns[i], nested_types[i], row_num, key_name); // Add size of each value from tuple
+            }
+            document_size += sizeof(BSON_DOCUMENT_END); // Add final \0
+
+            writeBSONSize(document_size, out);
+
+            for (size_t i = 0; i < nested_columns.size(); ++i)
+            {
+                /// The key must be exactly the one used for the size calculation above: the element name
+                /// for a named tuple and the position for an unnamed one. Writing the tuple's element
+                /// name for an unnamed tuple would give the array keys that differ from the counted ones
+                /// and could make the written size disagree with the declared document size.
+                String key_name = have_explicit_names ? toValidUTF8String(nested_names[i]) : std::to_string(i);
+                serializeField(*nested_columns[i], nested_types[i], row_num, key_name);
+            }
+
+            writeChar(BSON_DOCUMENT_END, out);
+            break;
+        }
+        case TypeIndex::Map:
+        {
+            const auto & map_type = assert_cast<const DataTypeMap &>(*data_type);
+            if (!isStringOrFixedString(map_type.getKeyType()))
+                throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Only maps with String key type are supported in BSON, got key type: {}", map_type.getKeyType()->getName());
+            const auto & value_type = map_type.getValueType();
+
+            const auto & map_column = assert_cast<const ColumnMap &>(column);
+            const auto & nested_column = map_column.getNestedColumn();
+            const auto & key_value_columns = map_column.getNestedData().getColumns();
+            const auto & key_column = key_value_columns[0];
+            const auto & value_column = key_value_columns[1];
+            const auto & offsets = nested_column.getOffsets();
+            size_t offset = offsets[row_num - 1];
+            size_t map_size = offsets[row_num] - offset;
+
+            writeBSONTypeAndKeyName(BSONType::DOCUMENT, name, out);
+
+            /// The size of the nested document goes before its content, so count it first.
+            size_t document_size = sizeof(BSONSizeT);
+            for (size_t i = 0; i < map_size; ++i)
+            {
+                String key = toValidUTF8String(key_column->getDataAt(offset + i).toString());
+                document_size += countBSONFieldSize(*value_column, value_type, offset + i, key);
+            }
+            document_size += sizeof(BSON_DOCUMENT_END);
+
+            writeBSONSize(document_size, out);
+
+            /// Each map entry becomes an element keyed by the (UTF-8-sanitized) map key.
+            for (size_t i = 0; i < map_size; ++i)
+            {
+                String key = toValidUTF8String(key_column->getDataAt(offset + i).toString());
+                serializeField(*value_column, value_type, offset + i, key);
+            }
+
+            writeChar(BSON_DOCUMENT_END, out);
+            break;
+        }
+        default:
+            throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Type {} is not supported in BSON output format", data_type->getName());
+    }
+}
+
+/// Writes one row as a BSON document: the document size, one element per column
+/// (keyed by the column names stored in `fields`) and the document terminator.
+/// Throws LOGICAL_ERROR if the number of bytes written differs from the precomputed size.
+void BSONEachRowRowOutputFormat::write(const Columns & columns, size_t row_num)
+{
+    const size_t num_columns = columns.size();
+
+    /// BSON stores the document size in front of the content, so it has to be known in advance.
+    size_t expected_size = sizeof(BSONSizeT) + sizeof(BSON_DOCUMENT_END);
+    for (size_t col = 0; col < num_columns; ++col)
+        expected_size += countBSONFieldSize(*columns[col], fields[col].type, row_num, fields[col].name);
+
+    const size_t start_position = out.count();
+    writeBSONSize(expected_size, out);
+
+    for (size_t col = 0; col < num_columns; ++col)
+        serializeField(*columns[col], fields[col].type, row_num, fields[col].name);
+
+    writeChar(BSON_DOCUMENT_END, out);
+
+    /// Sanity check: the size estimation and the serialization must agree.
+    const size_t written_size = out.count() - start_position;
+    if (written_size == expected_size)
+        return;
+
+    throw Exception(
+        ErrorCodes::LOGICAL_ERROR,
+        "The actual size of the BSON document does not match the estimated size: {} != {}",
+        written_size,
+        expected_size);
+}
+
+/// Registers the "BSONEachRow" output format in the factory and marks it as supporting parallel formatting.
+void registerOutputFormatBSONEachRow(FormatFactory & factory)
+{
+    auto create_format = [](WriteBuffer & buf, const Block & sample, const RowOutputFormatParams & params, const FormatSettings & _format_settings)
+    {
+        return std::make_shared<BSONEachRowRowOutputFormat>(buf, sample, params, _format_settings);
+    };
+
+    factory.registerOutputFormat("BSONEachRow", create_format);
+    factory.markOutputFormatSupportsParallelFormatting("BSONEachRow");
+}
+
+}
--- a/src/Processors/Formats/Impl/BSONEachRowRowOutputFormat.h
+++ b/src/Processors/Formats/Impl/BSONEachRowRowOutputFormat.h
@ -0,0 +1,69 @@
+#pragma once
+
+#include <Core/Block.h>
+#include <Formats/FormatSettings.h>
+#include <IO/WriteBuffer.h>
+#include <Processors/Formats/IRowOutputFormat.h>
+#include <Formats/BSONTypes.h>
+
+namespace DB
+{
+
+/*
+ * Class for formatting data in BSON format.
+ * Each row is formatted as a separate BSON document.
+ * Each column is formatted as a single field with column name as a key.
+ * It uses the following correspondence between ClickHouse types and BSON types:
+ *
+ * ClickHouse type         | BSON Type
+ * Bool                    | \x08 boolean
+ * Int8/UInt8              | \x10 int32
+ * Int16/UInt16            | \x10 int32
+ * Int32                   | \x10 int32
+ * UInt32                  | \x12 int64
+ * Int64                   | \x12 int64
+ * UInt64                  | \x12 int64
+ * Float32/Float64         | \x01 double
+ * Date/Date32             | \x10 int32
+ * DateTime                | \x12 int64
+ * DateTime64              | \x09 datetime
+ * Decimal32               | \x10 int32
+ * Decimal64               | \x12 int64
+ * Decimal128              | \x05 binary, \x00 binary subtype, size = 16
+ * Decimal256              | \x05 binary, \x00 binary subtype, size = 32
+ * Int128/UInt128          | \x05 binary, \x00 binary subtype, size = 16
+ * Int256/UInt256          | \x05 binary, \x00 binary subtype, size = 32
+ * String/FixedString      | \x05 binary, \x00 binary subtype or \x02 string if setting output_format_bson_string_as_string is enabled
+ * UUID                    | \x05 binary, \x04 uuid subtype, size = 16
+ * Array                   | \x04 array
+ * Tuple                   | \x04 array
+ * Named Tuple             | \x03 document
+ * Map (with String keys)  | \x03 document
+ *
+ * Note: on Big-Endian platforms this format will not work properly.
+ */
+
+/// Row output format that writes every row as a separate BSON document.
+class BSONEachRowRowOutputFormat final : public IRowOutputFormat
+{
+public:
+    BSONEachRowRowOutputFormat(
+        WriteBuffer & out_, const Block & header_, const RowOutputFormatParams & params_, const FormatSettings & settings_);
+
+    String getName() const override { return "BSONEachRowRowOutputFormat"; }
+
+private:
+    /// Writes the whole row at `row_num` as one BSON document.
+    void write(const Columns & columns, size_t row_num) override;
+    /// Does nothing: rows are serialized as a whole in write().
+    void writeField(const IColumn &, const ISerialization &, size_t) override { }
+
+    /// Writes a single value as a BSON element with the key `name`.
+    void serializeField(const IColumn & column, const DataTypePtr & data_type, size_t row_num, const String & name);
+
+    /// Count field size in bytes that we will get after serialization in BSON format.
+    /// It's needed to calculate document size before actual serialization,
+    /// because in BSON format we should write the size of the document before its content.
+    size_t countBSONFieldSize(const IColumn & column, const DataTypePtr & data_type, size_t row_num, const String & name);
+
+    /// Names and types of the columns; the names are used as BSON keys.
+    NamesAndTypes fields;
+    FormatSettings settings;
+};
+
+}
--- a/src/Processors/Formats/Impl/BinaryRowInputFormat.cpp
+++ b/src/Processors/Formats/Impl/BinaryRowInputFormat.cpp
@ -13,7 +13,7 @@ namespace ErrorCodes
    extern const int CANNOT_SKIP_UNKNOWN_FIELD;
 }

-BinaryRowInputFormat::BinaryRowInputFormat(ReadBuffer & in_, Block header, Params params_, bool with_names_, bool with_types_, const FormatSettings & format_settings_)
+BinaryRowInputFormat::BinaryRowInputFormat(ReadBuffer & in_, const Block & header, Params params_, bool with_names_, bool with_types_, const FormatSettings & format_settings_)
    : RowInputFormatWithNamesAndTypes(
        header,
        in_,
--- a/src/Processors/Formats/Impl/BinaryRowInputFormat.h
+++ b/src/Processors/Formats/Impl/BinaryRowInputFormat.h
@ -20,7 +20,7 @@ class ReadBuffer;
 class BinaryRowInputFormat final : public RowInputFormatWithNamesAndTypes
 {
 public:
-    BinaryRowInputFormat(ReadBuffer & in_, Block header, Params params_, bool with_names_, bool with_types_, const FormatSettings & format_settings_);
+    BinaryRowInputFormat(ReadBuffer & in_, const Block & header, Params params_, bool with_names_, bool with_types_, const FormatSettings & format_settings_);

    String getName() const override { return "BinaryRowInputFormat"; }

--- a/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.cpp
+++ b/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.cpp
@ -72,10 +72,10 @@ JSONColumnsBlockInputFormatBase::JSONColumnsBlockInputFormatBase(
    : IInputFormat(header_, in_)
    , format_settings(format_settings_)
    , fields(header_.getNamesAndTypes())
-    , name_to_index(header_.getNamesToIndexesMap())
    , serializations(header_.getSerializations())
    , reader(std::move(reader_))
 {
+    name_to_index = getPort().getHeader().getNamesToIndexesMap();
 }

 size_t JSONColumnsBlockInputFormatBase::readColumn(
@ -125,7 +125,7 @@ Chunk JSONColumnsBlockInputFormatBase::generate()
        {
            /// Check if this name appears in header. If no, skip this column or throw
            /// an exception according to setting input_format_skip_unknown_fields
-            if (!name_to_index.contains(*column_name))
+            if (!name_to_index.has(*column_name))
            {
                if (!format_settings.skip_unknown_fields)
                    throw Exception(ErrorCodes::INCORRECT_DATA, "Unknown column found in input data: {}", *column_name);
--- a/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.h
+++ b/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.h
@ -60,7 +60,7 @@ protected:
    const FormatSettings format_settings;
    const NamesAndTypes fields;
    /// Maps column names and their positions in header.
-    std::unordered_map<String, size_t> name_to_index;
+    Block::NameMap name_to_index;
    Serializations serializations;
    std::unique_ptr<JSONColumnsReaderBase> reader;
    BlockMissingValues block_missing_values;
--- a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp
+++ b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp
@ -37,25 +37,25 @@ JSONEachRowRowInputFormat::JSONEachRowRowInputFormat(
    Params params_,
    const FormatSettings & format_settings_,
    bool yield_strings_)
-    : IRowInputFormat(header_, in_, std::move(params_)), format_settings(format_settings_), name_map(header_.columns()), yield_strings(yield_strings_)
+    : IRowInputFormat(header_, in_, std::move(params_))
+    , format_settings(format_settings_)
+    , prev_positions(header_.columns())
+    , yield_strings(yield_strings_)
 {
-    size_t num_columns = getPort().getHeader().columns();
-    for (size_t i = 0; i < num_columns; ++i)
+    name_map = getPort().getHeader().getNamesToIndexesMap();
+    if (format_settings_.import_nested_json)
    {
-        const String & column_name = columnName(i);
-        name_map[column_name] = i;        /// NOTE You could place names more cache-locally.
-        if (format_settings_.import_nested_json)
+        for (size_t i = 0; i != header_.columns(); ++i)
        {
-            const auto split = Nested::splitName(column_name);
+            const StringRef column_name = header_.getByPosition(i).name;
+            const auto split = Nested::splitName(column_name.toView());
            if (!split.second.empty())
            {
-                const StringRef table_name(column_name.data(), split.first.size());
+                const StringRef table_name(column_name.data, split.first.size());
                name_map[table_name] = NESTED_FIELD;
            }
        }
    }
-
-    prev_positions.resize(num_columns);
 }

 const String & JSONEachRowRowInputFormat::columnName(size_t i) const
--- a/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp
+++ b/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp
@ -32,6 +32,7 @@
 #include <Columns/ColumnLowCardinality.h>

 #include <Formats/MsgPackExtensionTypes.h>
+#include <Formats/EscapingRuleUtils.h>

 namespace DB
 {
@ -552,12 +553,9 @@ void registerMsgPackSchemaReader(FormatFactory & factory)
    });
    factory.registerAdditionalInfoForSchemaCacheGetter("MsgPack", [](const FormatSettings & settings)
    {
-            return fmt::format(
-                "number_of_columns={}, schema_inference_hints={}, max_rows_to_read_for_schema_inference={}",
-                settings.msgpack.number_of_columns,
-                settings.schema_inference_hints,
-                settings.max_rows_to_read_for_schema_inference);
-        });
+            String result = getAdditionalFormatInfoForAllRowBasedFormats(settings);
+            return result + fmt::format(", number_of_columns={}", settings.msgpack.number_of_columns);
+    });
 }

 }
--- a/src/Processors/Formats/Impl/MySQLDumpRowInputFormat.cpp
+++ b/src/Processors/Formats/Impl/MySQLDumpRowInputFormat.cpp
@ -35,9 +35,9 @@ MySQLDumpRowInputFormat::MySQLDumpRowInputFormat(ReadBuffer & in_, const Block &
    : IRowInputFormat(header_, in_, params_)
    , table_name(format_settings_.mysql_dump.table_name)
    , types(header_.getDataTypes())
-    , column_indexes_by_names(header_.getNamesToIndexesMap())
    , format_settings(format_settings_)
 {
+    column_indexes_by_names = getPort().getHeader().getNamesToIndexesMap();
 }


--- a/Show More
+++ b/Show More