Merge branch 'master' into fast-count-from-files

Commit: 67c5c0203b
.gitmodules (vendored, 12 changed lines)
@@ -347,3 +347,15 @@
 [submodule "contrib/incbin"]
     path = contrib/incbin
     url = https://github.com/graphitemaster/incbin.git
+[submodule "contrib/usearch"]
+    path = contrib/usearch
+    url = https://github.com/unum-cloud/usearch.git
+[submodule "contrib/SimSIMD"]
+    path = contrib/SimSIMD
+    url = https://github.com/ashvardanian/SimSIMD.git
+[submodule "contrib/FP16"]
+    path = contrib/FP16
+    url = https://github.com/Maratyszcza/FP16.git
+[submodule "contrib/robin-map"]
+    path = contrib/robin-map
+    url = https://github.com/Tessil/robin-map.git
base/base/StringRef.h
@@ -11,6 +11,7 @@
 #include <base/defines.h>
 #include <base/types.h>
 #include <base/unaligned.h>
+#include <base/simd.h>

 #include <city.h>
@@ -29,6 +30,11 @@
 #define CRC_INT __crc32cd
 #endif

+#if defined(__aarch64__) && defined(__ARM_NEON)
+#include <arm_neon.h>
+#pragma clang diagnostic ignored "-Wreserved-identifier"
+#endif
+

 /**
   * The std::string_view-like container to avoid creating strings to find substrings in the hash table.
@@ -74,14 +80,14 @@ using StringRefs = std::vector<StringRef>;
   * For more information, see hash_map_string_2.cpp
   */

-inline bool compareSSE2(const char * p1, const char * p2)
+inline bool compare8(const char * p1, const char * p2)
 {
     return 0xFFFF == _mm_movemask_epi8(_mm_cmpeq_epi8(
         _mm_loadu_si128(reinterpret_cast<const __m128i *>(p1)),
         _mm_loadu_si128(reinterpret_cast<const __m128i *>(p2))));
 }

-inline bool compareSSE2x4(const char * p1, const char * p2)
+inline bool compare64(const char * p1, const char * p2)
 {
     return 0xFFFF == _mm_movemask_epi8(
         _mm_and_si128(
@@ -101,7 +107,30 @@ inline bool compareSSE2x4(const char * p1, const char * p2)
                 _mm_loadu_si128(reinterpret_cast<const __m128i *>(p2) + 3)))));
 }

-inline bool memequalSSE2Wide(const char * p1, const char * p2, size_t size)
+#elif defined(__aarch64__) && defined(__ARM_NEON)
+
+inline bool compare8(const char * p1, const char * p2)
+{
+    uint64_t mask = getNibbleMask(vceqq_u8(
+        vld1q_u8(reinterpret_cast<const unsigned char *>(p1)), vld1q_u8(reinterpret_cast<const unsigned char *>(p2))));
+    return 0xFFFFFFFFFFFFFFFF == mask;
+}
+
+inline bool compare64(const char * p1, const char * p2)
+{
+    uint64_t mask = getNibbleMask(vandq_u8(
+        vandq_u8(vceqq_u8(vld1q_u8(reinterpret_cast<const unsigned char *>(p1)), vld1q_u8(reinterpret_cast<const unsigned char *>(p2))),
+            vceqq_u8(vld1q_u8(reinterpret_cast<const unsigned char *>(p1 + 16)), vld1q_u8(reinterpret_cast<const unsigned char *>(p2 + 16)))),
+        vandq_u8(vceqq_u8(vld1q_u8(reinterpret_cast<const unsigned char *>(p1 + 32)), vld1q_u8(reinterpret_cast<const unsigned char *>(p2 + 32))),
+            vceqq_u8(vld1q_u8(reinterpret_cast<const unsigned char *>(p1 + 48)), vld1q_u8(reinterpret_cast<const unsigned char *>(p2 + 48))))));
+    return 0xFFFFFFFFFFFFFFFF == mask;
+}
+
+#endif
+
+#if defined(__SSE2__) || (defined(__aarch64__) && defined(__ARM_NEON))
+
+inline bool memequalWide(const char * p1, const char * p2, size_t size)
 {
     /** The order of branches and the trick with overlapping comparisons
       * are the same as in memcpy implementation.
@@ -138,7 +167,7 @@ inline bool memequalSSE2Wide(const char * p1, const char * p2, size_t size)

     while (size >= 64)
     {
-        if (compareSSE2x4(p1, p2))
+        if (compare64(p1, p2))
         {
             p1 += 64;
             p2 += 64;
@@ -150,17 +179,16 @@ inline bool memequalSSE2Wide(const char * p1, const char * p2, size_t size)

     switch (size / 16)
     {
-        case 3: if (!compareSSE2(p1 + 32, p2 + 32)) return false; [[fallthrough]];
-        case 2: if (!compareSSE2(p1 + 16, p2 + 16)) return false; [[fallthrough]];
-        case 1: if (!compareSSE2(p1, p2)) return false;
+        case 3: if (!compare8(p1 + 32, p2 + 32)) return false; [[fallthrough]];
+        case 2: if (!compare8(p1 + 16, p2 + 16)) return false; [[fallthrough]];
+        case 1: if (!compare8(p1, p2)) return false;
     }

-    return compareSSE2(p1 + size - 16, p2 + size - 16);
+    return compare8(p1 + size - 16, p2 + size - 16);
 }

 #endif
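The tail logic above is worth spelling out: whole 16-byte chunks are compared from the front, then the final 16 bytes are compared with a deliberately overlapping load anchored at the end, so no byte-level remainder loop is needed. A scalar sketch of the same trick for 16 < size < 64 (`eq16` and `memequalTail` are illustrative stand-ins, with memcmp in place of the vector compare):

```cpp
#include <cstddef>
#include <cstring>

// Stand-in for the 16-byte SIMD compare (compare8 in the diff).
static bool eq16(const char * a, const char * b) { return 0 == std::memcmp(a, b, 16); }

// Overlapping-comparison tail: whole 16-byte chunks from the front,
// then one final chunk anchored at the end that may overlap the previous one.
static bool memequalTail(const char * p1, const char * p2, size_t size) // 16 < size < 64
{
    switch (size / 16)
    {
        case 3: if (!eq16(p1 + 32, p2 + 32)) return false; [[fallthrough]];
        case 2: if (!eq16(p1 + 16, p2 + 16)) return false; [[fallthrough]];
        case 1: if (!eq16(p1, p2)) return false;
    }
    return eq16(p1 + size - 16, p2 + size - 16); // overlapping final 16 bytes
}
```

Because the final compare re-reads up to 15 already-checked bytes, every byte is covered without a scalar loop over the remainder.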

 inline bool operator== (StringRef lhs, StringRef rhs)
 {
     if (lhs.size != rhs.size)
@@ -169,8 +197,8 @@ inline bool operator== (StringRef lhs, StringRef rhs)
     if (lhs.size == 0)
         return true;

-#if defined(__SSE2__)
-    return memequalSSE2Wide(lhs.data, rhs.data, lhs.size);
+#if defined(__SSE2__) || (defined(__aarch64__) && defined(__ARM_NEON))
+    return memequalWide(lhs.data, rhs.data, lhs.size);
 #else
     return 0 == memcmp(lhs.data, rhs.data, lhs.size);
 #endif
base/base/simd.h (new file, 14 lines)
@@ -0,0 +1,14 @@
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_NEON)
+
+#  include <arm_neon.h>
+#  pragma clang diagnostic ignored "-Wreserved-identifier"
+
+/// Returns a 64 bit mask of nibbles (4 bits for each byte).
+inline uint64_t getNibbleMask(uint8x16_t res)
+{
+    return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(res), 4)), 0);
+}
+
+#endif
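The narrowing shift is the heart of this helper: `vshrn_n_u16(..., 4)` turns the 16 bytes of a `vceqq_u8` result (each 0x00 or 0xFF) into a 64-bit value carrying one nibble per input byte, so "all bytes equal" becomes a single comparison against `0xFFFFFFFFFFFFFFFF`. A scalar model of that packing (hypothetical `nibbleMaskScalar`, not part of the diff):

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>

// Scalar model of vshrn_n_u16(vreinterpretq_u16_u8(eq), 4): each u16 lane is
// shifted right by 4 and narrowed to 8 bits, so every source byte (0x00 or
// 0xFF) contributes exactly one nibble (0x0 or 0xF) to the 64-bit mask.
static uint64_t nibbleMaskScalar(const uint8_t eq[16])
{
    uint64_t mask = 0;
    for (int lane = 0; lane < 8; ++lane)
    {
        uint16_t v = static_cast<uint16_t>(eq[2 * lane] | (eq[2 * lane + 1] << 8));
        mask |= static_cast<uint64_t>(static_cast<uint8_t>(v >> 4)) << (8 * lane);
    }
    return mask;
}

int main()
{
    uint8_t eq[16];
    std::memset(eq, 0xFF, sizeof(eq));
    assert(nibbleMaskScalar(eq) == 0xFFFFFFFFFFFFFFFFull); // all 16 bytes equal
    eq[7] = 0x00; // one mismatching byte clears one nibble of the mask
    assert(nibbleMaskScalar(eq) != 0xFFFFFFFFFFFFFFFFull);
}
```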
cmake/dbms_glob_sources.cmake
@@ -4,10 +4,19 @@ macro(add_glob cur_list)
 endmacro()

 macro(add_headers_and_sources prefix common_path)
-    add_glob(${prefix}_headers ${CMAKE_CURRENT_SOURCE_DIR} ${common_path}/*.h)
-    add_glob(${prefix}_sources ${common_path}/*.cpp ${common_path}/*.c ${common_path}/*.h)
+    add_glob(${prefix}_headers ${common_path}/*.h)
+    add_glob(${prefix}_sources ${common_path}/*.cpp ${common_path}/*.c)
 endmacro()

 macro(add_headers_only prefix common_path)
-    add_glob(${prefix}_headers ${CMAKE_CURRENT_SOURCE_DIR} ${common_path}/*.h)
+    add_glob(${prefix}_headers ${common_path}/*.h)
 endmacro()
+
+macro(extract_into_parent_list src_list dest_list)
+    list(REMOVE_ITEM ${src_list} ${ARGN})
+    get_filename_component(__dir_name ${CMAKE_CURRENT_SOURCE_DIR} NAME)
+    foreach(file IN ITEMS ${ARGN})
+        list(APPEND ${dest_list} ${__dir_name}/${file})
+    endforeach()
+    set(${dest_list} "${${dest_list}}" PARENT_SCOPE)
+endmacro()
cmake/target.cmake
@@ -19,6 +19,19 @@ else ()
     message (FATAL_ERROR "Platform ${CMAKE_SYSTEM_NAME} is not supported")
 endif ()

+# Since we always use toolchain files to generate hermetic builds, cmake will
+# always think it's a cross-compilation. See
+# https://cmake.org/cmake/help/latest/variable/CMAKE_CROSSCOMPILING.html
+#
+# This will slow down cmake configuration and compilation. For instance, LLVM
+# will try to configure NATIVE LLVM targets with all tests enabled (you'll see
+# "Building native llvm-tblgen...").
+#
+# Here, we set it manually by checking the system name and processor.
+if (${CMAKE_SYSTEM_NAME} STREQUAL ${CMAKE_HOST_SYSTEM_NAME} AND ${CMAKE_SYSTEM_PROCESSOR} STREQUAL ${CMAKE_HOST_SYSTEM_PROCESSOR})
+    set (CMAKE_CROSSCOMPILING 0)
+endif ()
+
 if (CMAKE_CROSSCOMPILING)
     if (OS_DARWIN)
         # FIXME: broken dependencies
contrib/CMakeLists.txt (vendored, 11 changed lines)
@@ -196,6 +196,17 @@ if (ARCH_S390X)
     add_contrib(crc32-s390x-cmake crc32-s390x)
 endif()
 add_contrib (annoy-cmake annoy)

+option(ENABLE_USEARCH "Enable USearch (Approximate Neighborhood Search, HNSW) support" ${ENABLE_LIBRARIES})
+if (ENABLE_USEARCH)
+    add_contrib (FP16-cmake FP16)
+    add_contrib (robin-map-cmake robin-map)
+    add_contrib (SimSIMD-cmake SimSIMD)
+    add_contrib (usearch-cmake usearch) # requires: FP16, robin-map, SimSIMD
+else ()
+    message(STATUS "Not using USearch")
+endif ()
+
 add_contrib (xxHash-cmake xxHash)

 add_contrib (libbcrypt-cmake libbcrypt)
contrib/FP16 (new vendored submodule)
@@ -0,0 +1 @@
+Subproject commit 0a92994d729ff76a58f692d3028ca1b64b145d91
contrib/FP16-cmake/CMakeLists.txt (new file, 1 line)
@@ -0,0 +1 @@
+# See contrib/usearch-cmake/CMakeLists.txt
contrib/SimSIMD (new vendored submodule)
@@ -0,0 +1 @@
+Subproject commit de2cb75b9e9e3389d5e1e51fd9f8ed151f3c17cf
contrib/SimSIMD-cmake/CMakeLists.txt (new file, 1 line)
@@ -0,0 +1 @@
+# See contrib/usearch-cmake/CMakeLists.txt
contrib/boost (vendored submodule, updated)
@@ -1 +1 @@
-Subproject commit bb179652862b528d94a9032a784796c4db846c3f
+Subproject commit 063a9372b4ae304e869a5c5724971d0501552731
contrib/boost-cmake/CMakeLists.txt
@@ -19,6 +19,12 @@ add_library (_boost_filesystem ${SRCS_FILESYSTEM})
 add_library (boost::filesystem ALIAS _boost_filesystem)
 target_include_directories (_boost_filesystem SYSTEM BEFORE PUBLIC ${LIBRARY_DIR})

+if (OS_LINUX)
+    target_compile_definitions (_boost_filesystem PRIVATE
+        BOOST_FILESYSTEM_HAS_POSIX_AT_APIS=1
+    )
+endif ()
+
 # headers-only

 add_library (_boost_headers_only INTERFACE)
contrib/isa-l-cmake/CMakeLists.txt
@@ -1,6 +1,7 @@
 option(ENABLE_ISAL_LIBRARY "Enable ISA-L library" ${ENABLE_LIBRARIES})
-if (ARCH_AARCH64)
-    # Disable ISA-L library on aarch64.
+
+# ISA-L is only available for x86-64, so it shall be disabled for other platforms
+if (NOT ARCH_AMD64)
     set (ENABLE_ISAL_LIBRARY OFF)
 endif ()
contrib/krb5 (vendored submodule, updated)
@@ -1 +1 @@
-Subproject commit 1d5c970e9369f444caf81d1d06a231a6bad8581f
+Subproject commit 71b06c2276009ae649c7703019f3b4605f66fd3d
contrib/llvm-project (vendored submodule, updated)
@@ -1 +1 @@
-Subproject commit 4ef26de16c229429141e424375142c9b03234b66
+Subproject commit e7b8befca85c8b847614432dba250c22d35fbae0
contrib/orc (vendored submodule, updated)
@@ -1 +1 @@
-Subproject commit 568d1d60c250af1890f226c182bc15bd8cc94cf1
+Subproject commit a20d1d9d7ad4a4be7b7ba97588e16ca8b9abb2b6
contrib/robin-map (new vendored submodule)
@@ -0,0 +1 @@
+Subproject commit 851a59e0e3063ee0e23089062090a73fd3de482d
contrib/robin-map-cmake/CMakeLists.txt (new file, 1 line)
@@ -0,0 +1 @@
+# See contrib/usearch-cmake/CMakeLists.txt
contrib/snappy (vendored submodule, updated)
@@ -1 +1 @@
-Subproject commit fb057edfed820212076239fd32cb2ff23e9016bf
+Subproject commit 6ebb5b1ab8801ea3fde103c5c29f5ab86df5fe7a
contrib/usearch (new vendored submodule)
@@ -0,0 +1 @@
+Subproject commit 387b78b28b17b8954024ffc81e97cbcfa10d1f30
contrib/usearch-cmake/CMakeLists.txt (new file, 17 lines)
@@ -0,0 +1,17 @@
+set(USEARCH_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/usearch")
+set(USEARCH_SOURCE_DIR "${USEARCH_PROJECT_DIR}/include")
+
+set(FP16_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/FP16")
+set(ROBIN_MAP_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/robin-map")
+set(SIMSIMD_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/SimSIMD")
+
+add_library(_usearch INTERFACE)
+
+target_include_directories(_usearch SYSTEM INTERFACE
+    ${FP16_PROJECT_DIR}/include
+    ${ROBIN_MAP_PROJECT_DIR}/include
+    ${SIMSIMD_PROJECT_DIR}/include
+    ${USEARCH_SOURCE_DIR})
+
+add_library(ch_contrib::usearch ALIAS _usearch)
+target_compile_definitions(_usearch INTERFACE ENABLE_USEARCH)
@@ -20,6 +20,9 @@ services:
             - type: ${keeper_fs:-tmpfs}
               source: ${keeper_db_dir1:-}
               target: /var/lib/clickhouse-keeper
+            - type: ${keeper_fs:-tmpfs}
+              source: ${keeper_db_dir1:-}
+              target: /var/lib/clickhouse
         entrypoint: "${keeper_cmd_prefix:-clickhouse keeper} --config=/etc/clickhouse-keeper/keeper_config1.xml --log-file=/var/log/clickhouse-keeper/clickhouse-keeper.log --errorlog-file=/var/log/clickhouse-keeper/clickhouse-keeper.err.log"
         cap_add:
             - SYS_PTRACE
@@ -53,6 +56,9 @@ services:
             - type: ${keeper_fs:-tmpfs}
               source: ${keeper_db_dir2:-}
               target: /var/lib/clickhouse-keeper
+            - type: ${keeper_fs:-tmpfs}
+              source: ${keeper_db_dir1:-}
+              target: /var/lib/clickhouse
         entrypoint: "${keeper_cmd_prefix:-clickhouse keeper} --config=/etc/clickhouse-keeper/keeper_config2.xml --log-file=/var/log/clickhouse-keeper/clickhouse-keeper.log --errorlog-file=/var/log/clickhouse-keeper/clickhouse-keeper.err.log"
         cap_add:
            - SYS_PTRACE
@@ -86,6 +92,9 @@ services:
             - type: ${keeper_fs:-tmpfs}
               source: ${keeper_db_dir3:-}
               target: /var/lib/clickhouse-keeper
+            - type: ${keeper_fs:-tmpfs}
+              source: ${keeper_db_dir1:-}
+              target: /var/lib/clickhouse
         entrypoint: "${keeper_cmd_prefix:-clickhouse keeper} --config=/etc/clickhouse-keeper/keeper_config3.xml --log-file=/var/log/clickhouse-keeper/clickhouse-keeper.log --errorlog-file=/var/log/clickhouse-keeper/clickhouse-keeper.err.log"
         cap_add:
            - SYS_PTRACE
@@ -19,9 +19,9 @@
     <max_threads>12</max_threads>

     <!-- disable JIT for perf tests -->
-    <compile_expressions>1</compile_expressions>
-    <compile_aggregate_expressions>1</compile_aggregate_expressions>
-    <compile_sort_description>1</compile_sort_description>
+    <compile_expressions>0</compile_expressions>
+    <compile_aggregate_expressions>0</compile_aggregate_expressions>
+    <compile_sort_description>0</compile_sort_description>

     <!-- Don't fail some prewarm queries too early -->
     <timeout_before_checking_execution_speed>60</timeout_before_checking_execution_speed>
@@ -63,6 +63,7 @@ configure
 # it contains some new settings, but we can safely remove it
 rm /etc/clickhouse-server/config.d/merge_tree.xml
 rm /etc/clickhouse-server/config.d/enable_wait_for_shutdown_replicated_tables.xml
+rm /etc/clickhouse-server/config.d/filesystem_caches_path.xml
 rm /etc/clickhouse-server/users.d/nonconst_timezone.xml

 start
@@ -93,6 +94,7 @@ sudo chgrp clickhouse /etc/clickhouse-server/config.d/s3_storage_policy_by_defau
 # it contains some new settings, but we can safely remove it
 rm /etc/clickhouse-server/config.d/merge_tree.xml
 rm /etc/clickhouse-server/config.d/enable_wait_for_shutdown_replicated_tables.xml
+rm /etc/clickhouse-server/config.d/filesystem_caches_path.xml
 rm /etc/clickhouse-server/users.d/nonconst_timezone.xml

 start
docs/en/engines/table-engines/integrations/materialized-postgresql.md
@@ -13,7 +13,7 @@ If more than one table is required, it is highly recommended to use the [Materia

 ``` sql
 CREATE TABLE postgresql_db.postgresql_replica (key UInt64, value UInt64)
-ENGINE = MaterializedPostgreSQL('postgres1:5432', 'postgres_database', 'postgresql_replica', 'postgres_user', 'postgres_password')
+ENGINE = MaterializedPostgreSQL('postgres1:5432', 'postgres_database', 'postgresql_table', 'postgres_user', 'postgres_password')
 PRIMARY KEY key;
 ```
docs/en/engines/table-engines/mergetree-family/annindexes.md
@@ -142,13 +142,15 @@ was specified for ANN indexes, the default value is 100 million.

 - [Annoy](/docs/en/engines/table-engines/mergetree-family/annindexes.md#annoy-annoy)

+- [USearch](/docs/en/engines/table-engines/mergetree-family/annindexes.md#usearch-usearch)
+
 ## Annoy {#annoy}

 Annoy indexes are currently experimental; to use them you first need to `SET allow_experimental_annoy_index = 1`. They are also currently
 disabled on ARM due to memory safety problems with the algorithm.

-This type of ANN index implements [the Annoy algorithm](https://github.com/spotify/annoy) which is based on a recursive division of the
-space in random linear surfaces (lines in 2D, planes in 3D etc.).
+This type of ANN index is based on the [Annoy library](https://github.com/spotify/annoy) which recursively divides the space into random
+linear surfaces (lines in 2D, planes in 3D etc.).

 <div class='vimeo-container'>
   <iframe src="//www.youtube.com/embed/QkCCyLW0ehU"
@@ -216,3 +218,60 @@ ORDER BY L2Distance(vectors, Point)
 LIMIT N
 SETTINGS annoy_index_search_k_nodes=100;
 ```

+## USearch {#usearch}
+
+This type of ANN index is based on the [USearch library](https://github.com/unum-cloud/usearch), which implements the [HNSW
+algorithm](https://arxiv.org/abs/1603.09320), i.e., builds a hierarchical graph where each point represents a vector and the edges represent
+similarity. Such hierarchical structures can be very efficient on large collections. They may often fetch 0.05% or less data from the
+overall dataset, while still providing 99% recall. This is especially useful when working with high-dimensional vectors
+that are expensive to load and compare. The library also has several hardware-specific SIMD optimizations to further accelerate
+distance computations on modern Arm (NEON and SVE) and x86 (AVX2 and AVX-512) CPUs, and OS-specific optimizations to allow efficient
+navigation around immutable persistent files without loading them into RAM.
+
+<div class='vimeo-container'>
+  <iframe src="//www.youtube.com/embed/UMrhB3icP9w"
+    width="640"
+    height="360"
+    frameborder="0"
+    allow="autoplay;
+    fullscreen;
+    picture-in-picture"
+    allowfullscreen>
+  </iframe>
+</div>
+
+Syntax to create a USearch index over an [Array](../../../sql-reference/data-types/array.md) column:
+
+```sql
+CREATE TABLE table_with_usearch_index
+(
+    id Int64,
+    vectors Array(Float32),
+    INDEX [ann_index_name] vectors TYPE usearch([Distance]) [GRANULARITY N]
+)
+ENGINE = MergeTree
+ORDER BY id;
+```
+
+Syntax to create an ANN index over a [Tuple](../../../sql-reference/data-types/tuple.md) column:
+
+```sql
+CREATE TABLE table_with_usearch_index
+(
+    id Int64,
+    vectors Tuple(Float32[, Float32[, ...]]),
+    INDEX [ann_index_name] vectors TYPE usearch([Distance]) [GRANULARITY N]
+)
+ENGINE = MergeTree
+ORDER BY id;
+```
+
+USearch currently supports two distance functions:
+- `L2Distance`, also called Euclidean distance, is the length of a line segment between two points in Euclidean space
+  ([Wikipedia](https://en.wikipedia.org/wiki/Euclidean_distance)).
+- `cosineDistance`, also called cosine similarity, is the cosine of the angle between two (non-zero) vectors
+  ([Wikipedia](https://en.wikipedia.org/wiki/Cosine_similarity)).
+
+For normalized data, `L2Distance` is usually a better choice; otherwise `cosineDistance` is recommended to compensate for scale. If no
+distance function was specified during index creation, `L2Distance` is used by default.
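To make the two distance functions concrete, here is a small standalone sketch (assuming, as in ClickHouse's functions of these names, that `cosineDistance` is one minus the cosine similarity). It also shows why `L2Distance` works well for normalized data: for unit vectors, the squared L2 distance equals exactly twice the cosine distance, so the two rankings agree.

```cpp
#include <cassert>
#include <cmath>
#include <vector>

// Length of the line segment between two points.
static double l2Distance(const std::vector<double> & a, const std::vector<double> & b)
{
    double s = 0;
    for (size_t i = 0; i < a.size(); ++i)
        s += (a[i] - b[i]) * (a[i] - b[i]);
    return std::sqrt(s);
}

// One minus the cosine of the angle between the vectors (assumed semantics).
static double cosineDistance(const std::vector<double> & a, const std::vector<double> & b)
{
    double dot = 0, na = 0, nb = 0;
    for (size_t i = 0; i < a.size(); ++i)
    {
        dot += a[i] * b[i];
        na += a[i] * a[i];
        nb += b[i] * b[i];
    }
    return 1 - dot / std::sqrt(na * nb);
}

int main()
{
    std::vector<double> a{1, 0}, b{0, 1}; // orthogonal unit vectors
    assert(std::abs(l2Distance(a, b) - std::sqrt(2.0)) < 1e-12);
    assert(std::abs(cosineDistance(a, b) - 1.0) < 1e-12);
    // For unit vectors: L2^2 = 2 - 2*cos(angle) = 2 * cosineDistance.
    double l2 = l2Distance(a, b);
    assert(std::abs(l2 * l2 - 2 * cosineDistance(a, b)) < 1e-12);
}
```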
docs/en/interfaces/images/mysql1.png (new binary file, 232 KiB, not shown)
docs/en/interfaces/images/mysql2.png (new binary file, 102 KiB, not shown)
docs/en/interfaces/images/mysql3.png (new binary file, 37 KiB, not shown)
docs/en/interfaces/images/mysql4.png (new binary file, 88 KiB, not shown)
docs/en/interfaces/images/mysql5.png (new binary file, 246 KiB, not shown)
docs/en/interfaces/mysql.md
@@ -6,7 +6,34 @@ sidebar_label: MySQL Interface

 # MySQL Interface

-ClickHouse supports MySQL wire protocol. To enable the MySQL wire protocol, add the [mysql_port](../operations/server-configuration-parameters/settings.md#server_configuration_parameters-mysql_port) setting to your server's configuration file. For example, you could define the port in a new XML file in your `config.d` folder:
+ClickHouse supports the MySQL wire protocol. This allows tools that are MySQL-compatible to interact with ClickHouse seamlessly (e.g. [Looker Studio](../integrations/data-visualization/looker-studio-and-clickhouse.md)).
+
+## Enabling the MySQL Interface On ClickHouse Cloud
+
+1. After creating your ClickHouse Cloud Service, on the credentials screen, select the MySQL tab
+
+![Credentials screen - Prompt](./images/mysql1.png)
+
+2. Toggle the switch to enable the MySQL interface for this specific service. This will expose port `3306` for this service and prompt you with your MySQL connection screen, which includes your unique MySQL username. The password will be the same as the service's default user password.
+
+![Credentials screen - Enabled MySQL](./images/mysql2.png)
+
+Alternatively, in order to enable the MySQL interface for an existing service:
+
+1. Ensure your service is in the `Running` state, then click on the "View connection string" button for the service you want to enable the MySQL interface for
+
+![Connection screen - Prompt MySQL](./images/mysql3.png)
+
+2. Toggle the switch to enable the MySQL interface for this specific service. This will prompt you to enter the default password.
+
+![Connection screen - Prompt MySQL](./images/mysql4.png)
+
+3. After entering the password, you will be shown the MySQL connection string for this service
+![Connection screen - MySQL Enabled](./images/mysql5.png)
+
+## Enabling the MySQL Interface On Self-managed ClickHouse
+
+Add the [mysql_port](../operations/server-configuration-parameters/settings.md#server_configuration_parameters-mysql_port) setting to your server's configuration file. For example, you could define the port in a new XML file in your `config.d/` [folder](../operations/configuration-files):
+
 ``` xml
 <clickhouse>
@@ -20,7 +47,7 @@ Startup your ClickHouse server and look for a log message similar to the followi
 {} <Information> Application: Listening for MySQL compatibility protocol: 127.0.0.1:9004
 ```

-## Connect mysql to ClickHouse
+## Connect MySQL to ClickHouse

 The following command demonstrates how to connect the MySQL client `mysql` to ClickHouse:
docs/en/operations/server-configuration-parameters/settings.md
@@ -221,6 +221,10 @@ Default: 1024

 Size of cache for index marks. Zero means disabled.

+:::note
+This setting can be modified at runtime and will take effect immediately.
+:::
+
 Type: UInt64

 Default: 0
@@ -230,6 +234,10 @@ Default: 0

 Size of cache for uncompressed blocks of MergeTree indices. Zero means disabled.

+:::note
+This setting can be modified at runtime and will take effect immediately.
+:::
+
 Type: UInt64

 Default: 0
@@ -255,6 +263,10 @@ Default: SLRU

 Size of cache for marks (index of MergeTree family of tables).

+:::note
+This setting can be modified at runtime and will take effect immediately.
+:::
+
 Type: UInt64

 Default: 5368709120
@@ -288,7 +300,7 @@ Default: 1000

 Limit on total number of concurrently executed queries. Zero means unlimited. Note that limits on insert and select queries, and on the maximum number of queries for users must also be considered. See also max_concurrent_insert_queries, max_concurrent_select_queries, max_concurrent_queries_for_all_users.

 :::note
-These settings can be modified at runtime and will take effect immediately. Queries that are already running will remain unchanged.
+This setting can be modified at runtime and will take effect immediately. Queries that are already running will remain unchanged.
 :::

 Type: UInt64
@@ -300,7 +312,7 @@ Default: 0

 Limit on total number of concurrent insert queries. Zero means unlimited.

 :::note
-These settings can be modified at runtime and will take effect immediately. Queries that are already running will remain unchanged.
+This setting can be modified at runtime and will take effect immediately. Queries that are already running will remain unchanged.
 :::

 Type: UInt64
@@ -312,7 +324,7 @@ Default: 0

 Limit on total number of concurrent select queries. Zero means unlimited.

 :::note
-These settings can be modified at runtime and will take effect immediately. Queries that are already running will remain unchanged.
+This setting can be modified at runtime and will take effect immediately. Queries that are already running will remain unchanged.
 :::

 Type: UInt64
@@ -456,6 +468,10 @@ Sets the cache size (in bytes) for mapped files. This setting allows avoiding fr

 Note that the amount of data in mapped files does not consume memory directly and is not accounted for in query or server memory usage — because this memory can be discarded similar to the OS page cache. The cache is dropped (the files are closed) automatically on the removal of old parts in tables of the MergeTree family, also it can be dropped manually by the `SYSTEM DROP MMAP CACHE` query.

+:::note
+This setting can be modified at runtime and will take effect immediately.
+:::
+
 Type: UInt64

 Default: 1000
@@ -605,6 +621,10 @@ There is one shared cache for the server. Memory is allocated on demand. The cac

 The uncompressed cache is advantageous for very short queries in individual cases.

+:::note
+This setting can be modified at runtime and will take effect immediately.
+:::
+
 Type: UInt64

 Default: 0
docs/en/operations/system-tables/clusters.md
@@ -23,6 +23,7 @@ Columns:
 - `database_shard_name` ([String](../../sql-reference/data-types/string.md)) — The name of the `Replicated` database shard (for clusters that belong to a `Replicated` database).
 - `database_replica_name` ([String](../../sql-reference/data-types/string.md)) — The name of the `Replicated` database replica (for clusters that belong to a `Replicated` database).
 - `is_active` ([Nullable(UInt8)](../../sql-reference/data-types/int-uint.md)) — The status of the `Replicated` database replica (for clusters that belong to a `Replicated` database): 1 means "replica is online", 0 means "replica is offline", `NULL` means "unknown".
+- `name` ([String](../../sql-reference/data-types/string.md)) — An alias of `cluster`.

 **Example**
docs/en/sql-reference/statements/system.md
@@ -66,13 +66,13 @@ RELOAD FUNCTION [ON CLUSTER cluster_name] function_name

 ## DROP DNS CACHE

-Resets ClickHouse’s internal DNS cache. Sometimes (for old ClickHouse versions) it is necessary to use this command when changing the infrastructure (changing the IP address of another ClickHouse server or the server used by dictionaries).
+Clears ClickHouse’s internal DNS cache. Sometimes (for old ClickHouse versions) it is necessary to use this command when changing the infrastructure (changing the IP address of another ClickHouse server or the server used by dictionaries).

 For more convenient (automatic) cache management, see disable_internal_dns_cache, dns_cache_update_period parameters.

 ## DROP MARK CACHE

-Resets the mark cache.
+Clears the mark cache.

 ## DROP REPLICA

@@ -106,22 +106,18 @@ Similar to `SYSTEM DROP REPLICA`, but removes the `Replicated` database replica

 ## DROP UNCOMPRESSED CACHE

-Reset the uncompressed data cache.
+Clears the uncompressed data cache.
 The uncompressed data cache is enabled/disabled with the query/user/profile-level setting [use_uncompressed_cache](../../operations/settings/settings.md#setting-use_uncompressed_cache).
 Its size can be configured using the server-level setting [uncompressed_cache_size](../../operations/server-configuration-parameters/settings.md#server-settings-uncompressed_cache_size).

 ## DROP COMPILED EXPRESSION CACHE

-Reset the compiled expression cache.
+Clears the compiled expression cache.
 The compiled expression cache is enabled/disabled with the query/user/profile-level setting [compile_expressions](../../operations/settings/settings.md#compile-expressions).

 ## DROP QUERY CACHE

-Resets the [query cache](../../operations/query-cache.md).
-
-```sql
-SYSTEM DROP QUERY CACHE [ON CLUSTER cluster_name]
-```
+Clears the [query cache](../../operations/query-cache.md).

 ## FLUSH LOGS

@@ -443,9 +439,9 @@ SYSTEM STOP LISTEN [ON CLUSTER cluster_name] [QUERIES ALL | QUERIES DEFAULT | QU
 ```

 - If `CUSTOM 'protocol'` modifier is specified, the custom protocol with the specified name defined in the protocols section of the server configuration will be stopped.
-- If `QUERIES ALL` modifier is specified, all protocols are stopped.
-- If `QUERIES DEFAULT` modifier is specified, all default protocols are stopped.
-- If `QUERIES CUSTOM` modifier is specified, all custom protocols are stopped.
+- If `QUERIES ALL [EXCEPT .. [,..]]` modifier is specified, all protocols are stopped, unless specified with `EXCEPT` clause.
+- If `QUERIES DEFAULT [EXCEPT .. [,..]]` modifier is specified, all default protocols are stopped, unless specified with `EXCEPT` clause.
+- If `QUERIES CUSTOM [EXCEPT .. [,..]]` modifier is specified, all custom protocols are stopped, unless specified with `EXCEPT` clause.

 ### SYSTEM START LISTEN
programs/local/LocalServer.cpp
@@ -668,8 +668,7 @@ void LocalServer::processConfig()
         uncompressed_cache_size = max_cache_size;
         LOG_INFO(log, "Lowered uncompressed cache size to {} because the system has limited RAM", formatReadableSizeWithBinarySuffix(uncompressed_cache_size));
     }
-    if (uncompressed_cache_size)
-        global_context->setUncompressedCache(uncompressed_cache_policy, uncompressed_cache_size);
+    global_context->setUncompressedCache(uncompressed_cache_policy, uncompressed_cache_size);

     String mark_cache_policy = config().getString("mark_cache_policy", DEFAULT_MARK_CACHE_POLICY);
     size_t mark_cache_size = config().getUInt64("mark_cache_size", DEFAULT_MARK_CACHE_MAX_SIZE);
@@ -680,8 +679,7 @@ void LocalServer::processConfig()
         mark_cache_size = max_cache_size;
         LOG_INFO(log, "Lowered mark cache size to {} because the system has limited RAM", formatReadableSizeWithBinarySuffix(mark_cache_size));
     }
-    if (mark_cache_size)
-        global_context->setMarkCache(mark_cache_policy, mark_cache_size);
+    global_context->setMarkCache(mark_cache_policy, mark_cache_size);

     size_t index_uncompressed_cache_size = config().getUInt64("index_uncompressed_cache_size", DEFAULT_INDEX_UNCOMPRESSED_CACHE_MAX_SIZE);
     if (index_uncompressed_cache_size > max_cache_size)
@@ -689,8 +687,7 @@ void LocalServer::processConfig()
         index_uncompressed_cache_size = max_cache_size;
         LOG_INFO(log, "Lowered index uncompressed cache size to {} because the system has limited RAM", formatReadableSizeWithBinarySuffix(index_uncompressed_cache_size));
     }
-    if (index_uncompressed_cache_size)
-        global_context->setIndexUncompressedCache(index_uncompressed_cache_size);
+    global_context->setIndexUncompressedCache(index_uncompressed_cache_size);

     size_t index_mark_cache_size = config().getUInt64("index_mark_cache_size", DEFAULT_INDEX_MARK_CACHE_MAX_SIZE);
     if (index_mark_cache_size > max_cache_size)
@@ -698,8 +695,7 @@ void LocalServer::processConfig()
         index_mark_cache_size = max_cache_size;
         LOG_INFO(log, "Lowered index mark cache size to {} because the system has limited RAM", formatReadableSizeWithBinarySuffix(index_mark_cache_size));
     }
-    if (index_mark_cache_size)
-        global_context->setIndexMarkCache(index_mark_cache_size);
+    global_context->setIndexMarkCache(index_mark_cache_size);

     size_t mmap_cache_size = config().getUInt64("mmap_cache_size", DEFAULT_MMAP_CACHE_MAX_SIZE);
     if (mmap_cache_size > max_cache_size)
@@ -707,11 +703,10 @@ void LocalServer::processConfig()
         mmap_cache_size = max_cache_size;
         LOG_INFO(log, "Lowered mmap file cache size to {} because the system has limited RAM", formatReadableSizeWithBinarySuffix(mmap_cache_size));
     }
-    if (mmap_cache_size)
-        global_context->setMMappedFileCache(mmap_cache_size);
+    global_context->setMMappedFileCache(mmap_cache_size);

+    /// In Server.cpp (./clickhouse-server), we would initialize the query cache here.
+    /// Intentionally not doing this in clickhouse-local as it doesn't make sense.
+    /// Initialize a dummy query cache.
+    global_context->setQueryCache(0, 0, 0, 0);

 #if USE_EMBEDDED_COMPILER
     size_t compiled_expression_cache_max_size_in_bytes = config().getUInt64("compiled_expression_cache_size", DEFAULT_COMPILED_EXPRESSION_CACHE_MAX_SIZE);
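Every cache in this function follows the same shape: read the configured size, clamp it against a ceiling derived from physical RAM, log when lowering, then create the cache unconditionally. A minimal sketch of that clamping rule (`clampCacheSize` is an illustrative name; the ratio corresponds to the `cache_size_to_ram_max_ratio` server setting):

```cpp
#include <algorithm>
#include <cstddef>

// Illustrative sizing rule applied to each cache above: the configured size
// is capped at a fixed fraction of physical RAM before the cache is created.
static size_t clampCacheSize(size_t configured_size, size_t physical_ram_bytes, double max_ratio)
{
    const size_t max_cache_size = static_cast<size_t>(physical_ram_bytes * max_ratio);
    return std::min(configured_size, max_cache_size);
}

// Example: with 8 GiB of RAM and a ratio of 0.5, a 6 GiB configured cache
// is lowered to 4 GiB, which is what the "Lowered ... cache size" log reports.
```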
programs/obfuscator/Obfuscator.cpp
@@ -390,7 +390,10 @@ static void transformFixedString(const UInt8 * src, UInt8 * dst, size_t size, UI

 static void transformUUID(const UUID & src_uuid, UUID & dst_uuid, UInt64 seed)
 {
-    const UInt128 & src = src_uuid.toUnderType();
+    auto src_copy = src_uuid;
+    transformEndianness<std::endian::little, std::endian::native>(src_copy);
+
+    const UInt128 & src = src_copy.toUnderType();
     UInt128 & dst = dst_uuid.toUnderType();

     SipHash hash;
@@ -400,8 +403,9 @@ static void transformUUID(const UUID & src_uuid, UUID & dst_uuid, UInt64 seed)
     /// Saving version and variant from an old UUID
     dst = hash.get128();

-    dst.items[1] = (dst.items[1] & 0x1fffffffffffffffull) | (src.items[1] & 0xe000000000000000ull);
-    dst.items[0] = (dst.items[0] & 0xffffffffffff0fffull) | (src.items[0] & 0x000000000000f000ull);
+    const UInt64 trace[2] = {0x000000000000f000ull, 0xe000000000000000ull};
+    UUIDHelpers::getLowBytes(dst_uuid) = (UUIDHelpers::getLowBytes(dst_uuid) & (0xffffffffffffffffull - trace[1])) | (UUIDHelpers::getLowBytes(src_uuid) & trace[1]);
+    UUIDHelpers::getHighBytes(dst_uuid) = (UUIDHelpers::getHighBytes(dst_uuid) & (0xffffffffffffffffull - trace[0])) | (UUIDHelpers::getHighBytes(src_uuid) & trace[0]);
 }

 class FixedStringModel : public IModel
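The bit fiddling in `transformUUID` keeps the RFC 4122 version nibble (mask `0x000000000000f000` in the high 64 bits) and the variant bits (mask `0xe000000000000000` in the low 64 bits) of the source UUID, while everything else comes from the hash. A toy illustration of the keep-some-bits pattern (hypothetical `keepBits`, not part of the diff):

```cpp
#include <cassert>
#include <cstdint>

// Take every bit from dst except those selected by mask, which come from src.
static uint64_t keepBits(uint64_t dst, uint64_t src, uint64_t mask)
{
    return (dst & ~mask) | (src & mask);
}

int main()
{
    uint64_t src_hi = 0x1111111111114111ull; // a version-4 UUID's high half
    uint64_t dst_hi = 0x2222222222229222ull; // freshly hashed bits
    uint64_t out_hi = keepBits(dst_hi, src_hi, 0x000000000000f000ull);
    assert((out_hi & 0xf000) == 0x4000); // version nibble survives the hashing
}
```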
programs/server/Server.cpp
@@ -1105,6 +1105,69 @@ try
     if (config().has("macros"))
         global_context->setMacros(std::make_unique<Macros>(config(), "macros", log));

+    /// Set up caches.
+
+    const size_t max_cache_size = static_cast<size_t>(physical_server_memory * server_settings.cache_size_to_ram_max_ratio);
+
+    String uncompressed_cache_policy = server_settings.uncompressed_cache_policy;
+    size_t uncompressed_cache_size = server_settings.uncompressed_cache_size;
+    if (uncompressed_cache_size > max_cache_size)
+    {
+        uncompressed_cache_size = max_cache_size;
+        LOG_INFO(log, "Lowered uncompressed cache size to {} because the system has limited RAM", formatReadableSizeWithBinarySuffix(uncompressed_cache_size));
+    }
+    global_context->setUncompressedCache(uncompressed_cache_policy, uncompressed_cache_size);
+
+    String mark_cache_policy = server_settings.mark_cache_policy;
+    size_t mark_cache_size = server_settings.mark_cache_size;
+    if (mark_cache_size > max_cache_size)
+    {
+        mark_cache_size = max_cache_size;
+        LOG_INFO(log, "Lowered mark cache size to {} because the system has limited RAM", formatReadableSizeWithBinarySuffix(mark_cache_size));
+    }
+    global_context->setMarkCache(mark_cache_policy, mark_cache_size);
+
+    size_t index_uncompressed_cache_size = server_settings.index_uncompressed_cache_size;
+    if (index_uncompressed_cache_size > max_cache_size)
+    {
+        index_uncompressed_cache_size = max_cache_size;
+        LOG_INFO(log, "Lowered index uncompressed cache size to {} because the system has limited RAM", formatReadableSizeWithBinarySuffix(index_uncompressed_cache_size));
+    }
+    global_context->setIndexUncompressedCache(index_uncompressed_cache_size);
+
+    size_t index_mark_cache_size = server_settings.index_mark_cache_size;
+    if (index_mark_cache_size > max_cache_size)
+    {
+        index_mark_cache_size = max_cache_size;
+        LOG_INFO(log, "Lowered index mark cache size to {} because the system has limited RAM", formatReadableSizeWithBinarySuffix(index_mark_cache_size));
+    }
+    global_context->setIndexMarkCache(index_mark_cache_size);
+
+    size_t mmap_cache_size = server_settings.mmap_cache_size;
+    if (mmap_cache_size > max_cache_size)
+    {
+        mmap_cache_size = max_cache_size;
+        LOG_INFO(log, "Lowered mmap file cache size to {} because the system has limited RAM", formatReadableSizeWithBinarySuffix(mmap_cache_size));
+    }
+    global_context->setMMappedFileCache(mmap_cache_size);
+
+    size_t query_cache_max_size_in_bytes = config().getUInt64("query_cache.max_size_in_bytes", DEFAULT_QUERY_CACHE_MAX_SIZE);
+    size_t query_cache_max_entries = config().getUInt64("query_cache.max_entries", DEFAULT_QUERY_CACHE_MAX_ENTRIES);
+    size_t query_cache_query_cache_max_entry_size_in_bytes = config().getUInt64("query_cache.max_entry_size_in_bytes", DEFAULT_QUERY_CACHE_MAX_ENTRY_SIZE_IN_BYTES);
+    size_t query_cache_max_entry_size_in_rows = config().getUInt64("query_cache.max_entry_size_in_rows", DEFAULT_QUERY_CACHE_MAX_ENTRY_SIZE_IN_ROWS);
+    if (query_cache_max_size_in_bytes > max_cache_size)
+    {
+        query_cache_max_size_in_bytes = max_cache_size;
+        LOG_INFO(log, "Lowered query cache size to {} because the system has limited RAM", formatReadableSizeWithBinarySuffix(query_cache_max_size_in_bytes));
+    }
+    global_context->setQueryCache(query_cache_max_size_in_bytes, query_cache_max_entries, query_cache_query_cache_max_entry_size_in_bytes, query_cache_max_entry_size_in_rows);
+
+#if USE_EMBEDDED_COMPILER
+    size_t compiled_expression_cache_max_size_in_bytes = config().getUInt64("compiled_expression_cache_size", DEFAULT_COMPILED_EXPRESSION_CACHE_MAX_SIZE);
+    size_t compiled_expression_cache_max_elements = config().getUInt64("compiled_expression_cache_elements_size", DEFAULT_COMPILED_EXPRESSION_CACHE_MAX_ENTRIES);
+    CompiledExpressionCacheFactory::instance().init(compiled_expression_cache_max_size_in_bytes, compiled_expression_cache_max_elements);
+#endif
+
     /// Initialize main config reloader.
     std::string include_from_path = config().getString("include_from", "/etc/metrika.xml");

@@ -1324,7 +1387,14 @@ try
     global_context->updateStorageConfiguration(*config);
     global_context->updateInterserverCredentials(*config);

+    global_context->updateUncompressedCacheConfiguration(*config);
+    global_context->updateMarkCacheConfiguration(*config);
+    global_context->updateIndexUncompressedCacheConfiguration(*config);
+    global_context->updateIndexMarkCacheConfiguration(*config);
+    global_context->updateMMappedFileCacheConfiguration(*config);
     global_context->updateQueryCacheConfiguration(*config);
+
     CompressionCodecEncrypted::Configuration::instance().tryLoad(*config, "encryption_codecs");
 #if USE_SSL
     CertificateReloader::instance().tryLoad(*config);
@@ -1484,19 +1554,6 @@ try
     /// Limit on total number of concurrently executed queries.
     global_context->getProcessList().setMaxSize(server_settings.max_concurrent_queries);

-    /// Set up caches.
-
-    const size_t max_cache_size = static_cast<size_t>(physical_server_memory * server_settings.cache_size_to_ram_max_ratio);
-
-    String uncompressed_cache_policy = server_settings.uncompressed_cache_policy;
-    size_t uncompressed_cache_size = server_settings.uncompressed_cache_size;
-    if (uncompressed_cache_size > max_cache_size)
-    {
-        uncompressed_cache_size = max_cache_size;
-        LOG_INFO(log, "Lowered uncompressed cache size to {} because the system has limited RAM", formatReadableSizeWithBinarySuffix(uncompressed_cache_size));
-    }
-    global_context->setUncompressedCache(uncompressed_cache_policy, uncompressed_cache_size);
-
     /// Load global settings from default_profile and system_profile.
     global_context->setDefaultProfiles(config());

@@ -1512,61 +1569,6 @@ try
             server_settings.async_insert_queue_flush_on_shutdown));
     }

-    String mark_cache_policy = server_settings.mark_cache_policy;
-    size_t mark_cache_size = server_settings.mark_cache_size;
-    if (!mark_cache_size)
-        LOG_ERROR(log, "Too low mark cache size will lead to severe performance degradation.");
-    if (mark_cache_size > max_cache_size)
-    {
-        mark_cache_size = max_cache_size;
-        LOG_INFO(log, "Lowered mark cache size to {} because the system has limited RAM", formatReadableSizeWithBinarySuffix(mark_cache_size));
-    }
-    global_context->setMarkCache(mark_cache_policy, mark_cache_size);
-
-    size_t index_uncompressed_cache_size = server_settings.index_uncompressed_cache_size;
-    if (index_uncompressed_cache_size > max_cache_size)
-    {
-        index_uncompressed_cache_size = max_cache_size;
-        LOG_INFO(log, "Lowered index uncompressed cache size to {} because the system has limited RAM", formatReadableSizeWithBinarySuffix(uncompressed_cache_size));
-    }
-    if (index_uncompressed_cache_size)
-        global_context->setIndexUncompressedCache(server_settings.index_uncompressed_cache_size);
-
-    size_t index_mark_cache_size = server_settings.index_mark_cache_size;
-    if (index_mark_cache_size > max_cache_size)
-    {
-        index_mark_cache_size = max_cache_size;
-        LOG_INFO(log, "Lowered index mark cache size to {} because the system has limited RAM", formatReadableSizeWithBinarySuffix(uncompressed_cache_size));
-    }
-    if (index_mark_cache_size)
-        global_context->setIndexMarkCache(server_settings.index_mark_cache_size);
-
-    size_t mmap_cache_size = server_settings.mmap_cache_size;
-    if (mmap_cache_size > max_cache_size)
-    {
-        mmap_cache_size = max_cache_size;
-        LOG_INFO(log, "Lowered mmap file cache size to {} because the system has limited RAM", formatReadableSizeWithBinarySuffix(uncompressed_cache_size));
-    }
-    if (mmap_cache_size)
-        global_context->setMMappedFileCache(server_settings.mmap_cache_size);
-
-    size_t query_cache_max_size_in_bytes = config().getUInt64("query_cache.max_size_in_bytes", DEFAULT_QUERY_CACHE_MAX_SIZE);
-    size_t query_cache_max_entries = config().getUInt64("query_cache.max_entries", DEFAULT_QUERY_CACHE_MAX_ENTRIES);
-    size_t query_cache_query_cache_max_entry_size_in_bytes = config().getUInt64("query_cache.max_entry_size_in_bytes", DEFAULT_QUERY_CACHE_MAX_ENTRY_SIZE_IN_BYTES);
-    size_t query_cache_max_entry_size_in_rows = config().getUInt64("query_cache.max_entry_rows_in_rows", DEFAULT_QUERY_CACHE_MAX_ENTRY_SIZE_IN_ROWS);
-    if (query_cache_max_size_in_bytes > max_cache_size)
-    {
-        query_cache_max_size_in_bytes = max_cache_size;
-        LOG_INFO(log, "Lowered query cache size to {} because the system has limited RAM", formatReadableSizeWithBinarySuffix(uncompressed_cache_size));
-    }
-    global_context->setQueryCache(query_cache_max_size_in_bytes, query_cache_max_entries, query_cache_query_cache_max_entry_size_in_bytes, query_cache_max_entry_size_in_rows);
-
-#if USE_EMBEDDED_COMPILER
-    size_t compiled_expression_cache_max_size_in_bytes = config().getUInt64("compiled_expression_cache_size", DEFAULT_COMPILED_EXPRESSION_CACHE_MAX_SIZE);
-    size_t compiled_expression_cache_max_elements = config().getUInt64("compiled_expression_cache_elements_size", DEFAULT_COMPILED_EXPRESSION_CACHE_MAX_ENTRIES);
-    CompiledExpressionCacheFactory::instance().init(compiled_expression_cache_max_size_in_bytes, compiled_expression_cache_max_elements);
-#endif
-
     /// Set path for format schema files
     fs::path format_schema_path(config().getString("format_schema_path", path / "format_schemas/"));
     global_context->setFormatSchemaPath(format_schema_path);
@@ -2072,6 +2074,9 @@ void Server::createServers(

     for (const auto & protocol : protocols)
     {
+        if (!server_type.shouldStart(ServerType::Type::CUSTOM, protocol))
+            continue;
+
         std::string prefix = "protocols." + protocol + ".";
         std::string port_name = prefix + "port";
         std::string description {"<undefined> protocol"};
@@ -2081,9 +2086,6 @@ void Server::createServers(
         if (!config.has(prefix + "port"))
             continue;

-        if (!server_type.shouldStart(ServerType::Type::CUSTOM, port_name))
-            continue;
-
         std::vector<std::string> hosts;
         if (config.has(prefix + "host"))
             hosts.push_back(config.getString(prefix + "host"));
src/Access/MultipleAccessStorage.cpp
@@ -46,7 +46,7 @@ void MultipleAccessStorage::setStorages(const std::vector<StoragePtr> & storages
 {
     std::lock_guard lock{mutex};
     nested_storages = std::make_shared<const Storages>(storages);
-    ids_cache.reset();
+    ids_cache.clear();
 }

 void MultipleAccessStorage::addStorage(const StoragePtr & new_storage)
@@ -69,7 +69,7 @@ void MultipleAccessStorage::removeStorage(const StoragePtr & storage_to_remove)
     auto new_storages = std::make_shared<Storages>(*nested_storages);
     new_storages->erase(new_storages->begin() + index);
     nested_storages = new_storages;
-    ids_cache.reset();
+    ids_cache.clear();
 }

 std::vector<StoragePtr> MultipleAccessStorage::getStorages()
@@ -11,6 +11,7 @@
 #include <Common/Config/ConfigReloader.h>
 #include <Common/StringUtils/StringUtils.h>
 #include <Common/quoteString.h>
+#include <Common/TransformEndianness.hpp>
 #include <Core/Settings.h>
 #include <Interpreters/executeQuery.h>
 #include <Parsers/Access/ASTGrantQuery.h>
@@ -49,6 +50,7 @@ namespace
         md5.update(type_storage_chars, strlen(type_storage_chars));
         UUID result;
         memcpy(&result, md5.digest().data(), md5.digestLength());
+        transformEndianness<std::endian::native, std::endian::little>(result);
         return result;
     }
src/AggregateFunctions/AggregateFunctionAvg.h
@@ -109,7 +109,7 @@ public:

     void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
     {
-        writeBinary(this->data(place).numerator, buf);
+        writeBinaryLittleEndian(this->data(place).numerator, buf);

         if constexpr (std::is_unsigned_v<Denominator>)
             writeVarUInt(this->data(place).denominator, buf);
@@ -119,7 +119,7 @@ public:

     void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
     {
-        readBinary(this->data(place).numerator, buf);
+        readBinaryLittleEndian(this->data(place).numerator, buf);

         if constexpr (std::is_unsigned_v<Denominator>)
             readVarUInt(this->data(place).denominator, buf);
src/AggregateFunctions/AggregateFunctionBoundingRatio.h
@@ -100,6 +100,17 @@ void AggregateFunctionBoundingRatioData::deserialize(ReadBuffer & buf)
     }
 }

+inline void writeBinary(const AggregateFunctionBoundingRatioData::Point & p, WriteBuffer & buf)
+{
+    writePODBinary(p, buf);
+}
+
+inline void readBinary(AggregateFunctionBoundingRatioData::Point & p, ReadBuffer & buf)
+{
+    readPODBinary(p, buf);
+}
+
 class AggregateFunctionBoundingRatio final : public IAggregateFunctionDataHelper<AggregateFunctionBoundingRatioData, AggregateFunctionBoundingRatio>
 {
 private:
src/AggregateFunctions/CMakeLists.txt
@@ -1,28 +1,26 @@
 include("${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake")
 add_headers_and_sources(clickhouse_aggregate_functions .)

-list(REMOVE_ITEM clickhouse_aggregate_functions_sources
+extract_into_parent_list(clickhouse_aggregate_functions_sources dbms_sources
     IAggregateFunction.cpp
     AggregateFunctionFactory.cpp
     AggregateFunctionCombinatorFactory.cpp
-    AggregateFunctionCount.cpp
     AggregateFunctionState.cpp
+    AggregateFunctionCount.cpp
     parseAggregateFunctionParameters.cpp
     FactoryHelpers.cpp
 )

-list(REMOVE_ITEM clickhouse_aggregate_functions_headers
+extract_into_parent_list(clickhouse_aggregate_functions_headers dbms_headers
     IAggregateFunction.h
     IAggregateFunctionCombinator.h
     AggregateFunctionFactory.h
     AggregateFunctionCombinatorFactory.h
     AggregateFunctionCount.h
     AggregateFunctionState.h
-    parseAggregateFunctionParameters.h
+    AggregateFunctionCount.cpp
     FactoryHelpers.h
     parseAggregateFunctionParameters.h
 )

-add_library(clickhouse_aggregate_functions ${clickhouse_aggregate_functions_sources})
+add_library(clickhouse_aggregate_functions ${clickhouse_aggregate_functions_headers} ${clickhouse_aggregate_functions_sources})
 target_link_libraries(clickhouse_aggregate_functions PRIVATE dbms PUBLIC ch_contrib::cityhash)

 if(ENABLE_EXAMPLES)
@@ -783,6 +783,16 @@ public:
         for (size_t i = 0; i < size; ++i)
             result[i] = std::numeric_limits<float>::quiet_NaN();
     }
+
+    friend void writeBinary(const Kind & x, WriteBuffer & buf)
+    {
+        writePODBinary(x, buf);
+    }
+
+    friend void readBinary(Kind & x, ReadBuffer & buf)
+    {
+        readPODBinary(x, buf);
+    }
 };

 #undef SMALL_THRESHOLD
@@ -276,3 +276,12 @@ private:
         return NanLikeValueConstructor<ResultType, std::is_floating_point_v<ResultType>>::getValue();
     }
 };
+
+namespace DB
+{
+template <typename T>
+void readBinary(std::pair<T, UInt32> & x, ReadBuffer & buf)
+{
+    readPODBinary(x, buf);
+}
+}
src/Backups/BackupsWorker.cpp
@@ -563,8 +563,13 @@ void BackupsWorker::writeBackupEntries(BackupMutablePtr backup, BackupEntries &&
             }
         };

-        if (always_single_threaded || !backups_thread_pool->trySchedule([job] { job(true); }))
+        if (always_single_threaded)
+        {
             job(false);
+            continue;
+        }
+
+        backups_thread_pool->scheduleOrThrowOnError([job] { job(true); });
     }

     {
@@ -854,8 +859,7 @@ void BackupsWorker::restoreTablesData(const OperationID & restore_id, BackupPtr
             }
         };

-        if (!thread_pool.trySchedule([job] { job(true); }))
-            job(false);
+        thread_pool.scheduleOrThrowOnError([job] { job(true); });
     }

     {
src/CMakeLists.txt
@@ -49,6 +49,8 @@ else()
     add_definitions(-DENABLE_MULTITARGET_CODE=0)
 endif()

+set(dbms_headers)
+set(dbms_sources)

 add_subdirectory (Access)
 add_subdirectory (Backups)
@@ -78,10 +80,6 @@ add_subdirectory (Daemon)
 add_subdirectory (Loggers)
 add_subdirectory (Formats)

-set(dbms_headers)
-set(dbms_sources)
-
 add_headers_and_sources(clickhouse_common_io Common)
 add_headers_and_sources(clickhouse_common_io Common/HashTable)
 add_headers_and_sources(clickhouse_common_io IO)
@@ -151,47 +149,7 @@ else()
     message(STATUS "StorageFileLog is only supported on Linux")
 endif ()

-list (APPEND clickhouse_common_io_sources ${CONFIG_INCLUDE_PATH}/config_version.cpp)
-
-list (APPEND dbms_sources Functions/IFunction.cpp Functions/FunctionFactory.cpp Functions/FunctionHelpers.cpp Functions/extractTimeZoneFromFunctionArguments.cpp Functions/FunctionsLogical.cpp Functions/indexHint.cpp)
-list (APPEND dbms_headers Functions/IFunction.h Functions/FunctionFactory.h Functions/FunctionHelpers.h Functions/extractTimeZoneFromFunctionArguments.h Functions/FunctionsLogical.h Functions/indexHint.h)
-
-list (APPEND dbms_sources
-    AggregateFunctions/IAggregateFunction.cpp
-    AggregateFunctions/AggregateFunctionFactory.cpp
-    AggregateFunctions/AggregateFunctionCombinatorFactory.cpp
-    AggregateFunctions/AggregateFunctionState.cpp
-    AggregateFunctions/AggregateFunctionCount.cpp
-    AggregateFunctions/parseAggregateFunctionParameters.cpp)
-list (APPEND dbms_headers
-    AggregateFunctions/IAggregateFunction.h
-    AggregateFunctions/IAggregateFunctionCombinator.h
-    AggregateFunctions/AggregateFunctionFactory.h
-    AggregateFunctions/AggregateFunctionCombinatorFactory.h
-    AggregateFunctions/AggregateFunctionState.h
-    AggregateFunctions/AggregateFunctionCount.cpp
-    AggregateFunctions/FactoryHelpers.h
-    AggregateFunctions/parseAggregateFunctionParameters.h)
-
-list (APPEND dbms_sources
-    TableFunctions/ITableFunction.cpp
-    TableFunctions/TableFunctionView.cpp
-    TableFunctions/TableFunctionFactory.cpp)
-list (APPEND dbms_headers
-    TableFunctions/ITableFunction.h
-    TableFunctions/TableFunctionView.h
-    TableFunctions/TableFunctionFactory.h)
-
-list (APPEND dbms_sources
-    Dictionaries/DictionaryFactory.cpp
-    Dictionaries/DictionarySourceFactory.cpp
-    Dictionaries/DictionaryStructure.cpp
-    Dictionaries/getDictionaryConfigurationFromAST.cpp)
-list (APPEND dbms_headers
-    Dictionaries/DictionaryFactory.h
-    Dictionaries/DictionarySourceFactory.h
-    Dictionaries/DictionaryStructure.h
-    Dictionaries/getDictionaryConfigurationFromAST.h)
+list(APPEND clickhouse_common_io_sources ${CONFIG_INCLUDE_PATH}/config_version.cpp)

 if (NOT ENABLE_SSL)
     list (REMOVE_ITEM clickhouse_common_io_sources Common/OpenSSLHelpers.cpp)
@@ -599,6 +557,10 @@ if (TARGET ch_contrib::annoy)
     dbms_target_link_libraries(PUBLIC ch_contrib::annoy)
 endif()

+if (TARGET ch_contrib::usearch)
+    dbms_target_link_libraries(PUBLIC ch_contrib::usearch)
+endif()
+
 if (TARGET ch_rust::skim)
     dbms_target_include_directories(PRIVATE $<TARGET_PROPERTY:ch_rust::skim,INTERFACE_INCLUDE_DIRECTORIES>)
     dbms_target_link_libraries(PUBLIC ch_rust::skim)
@ -865,10 +865,14 @@ ColumnPtr ColumnNullable::getNestedColumnWithDefaultOnNull() const
if (next_null_index != start)
res->insertRangeFrom(*nested_column, start, next_null_index - start);

if (next_null_index < end)
res->insertDefault();
size_t next_none_null_index = next_null_index;
while (next_none_null_index < end && null_map_data[next_none_null_index])
++next_none_null_index;

start = next_null_index + 1;
if (next_null_index != next_none_null_index)
res->insertManyDefaults(next_none_null_index - next_null_index);

start = next_none_null_index;
}
return res;
}
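
The rewritten loop above batches an entire run of NULLs into one insertManyDefaults call instead of emitting a single default and rescanning. A minimal standalone sketch of the same run-skipping idea, over plain vectors rather than the ClickHouse column API (all names here are hypothetical):

#include <cstddef>
#include <cstdint>
#include <vector>

// Copy `values` into the result, replacing runs of NULLs (null_map[i] == 1)
// with default-constructed elements, one batched fill per run.
std::vector<int64_t> withDefaultsOnNull(const std::vector<int64_t> & values,
                                        const std::vector<uint8_t> & null_map)
{
    std::vector<int64_t> out;
    out.reserve(values.size());
    size_t start = 0;
    const size_t end = values.size();
    while (start < end)
    {
        size_t next_null = start;
        while (next_null < end && !null_map[next_null])        // copy the non-NULL prefix
            ++next_null;
        out.insert(out.end(), values.begin() + start, values.begin() + next_null);

        size_t next_non_null = next_null;
        while (next_non_null < end && null_map[next_non_null]) // walk over the whole NULL run
            ++next_non_null;
        out.insert(out.end(), next_non_null - next_null, int64_t{}); // one batched default-fill
        start = next_non_null;
    }
    return out;
}
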
@ -151,7 +151,7 @@ public:
std::lock_guard cache_lock(mutex);

/// Insert the new value only if the token is still present in insert_tokens.
/// (The token may be absent because of a concurrent reset() call).
/// (The token may be absent because of a concurrent clear() call).
bool result = false;
auto token_it = insert_tokens.find(key);
if (token_it != insert_tokens.end() && token_it->second.get() == token)
@ -179,13 +179,13 @@ public:
return cache_policy->dump();
}

void reset()
void clear()
{
std::lock_guard lock(mutex);
insert_tokens.clear();
hits = 0;
misses = 0;
cache_policy->reset(lock);
cache_policy->clear(lock);
}

void remove(const Key & key)
@ -270,8 +270,8 @@ std::unordered_set<String> DNSResolver::reverseResolve(const Poco::Net::IPAddres

void DNSResolver::dropCache()
{
impl->cache_host.reset();
impl->cache_address.reset();
impl->cache_host.clear();
impl->cache_address.clear();

std::scoped_lock lock(impl->update_mutex, impl->drop_mutex);

@ -33,6 +33,7 @@ static struct InitFiu

#define APPLY_FOR_FAILPOINTS(ONCE, REGULAR, PAUSEABLE_ONCE, PAUSEABLE) \
ONCE(replicated_merge_tree_commit_zk_fail_after_op) \
REGULAR(use_delayed_remote_source) \
REGULAR(dummy_failpoint) \
PAUSEABLE_ONCE(dummy_pausable_failpoint_once) \
PAUSEABLE(dummy_pausable_failpoint)
@ -20,7 +20,7 @@ template <typename T>
static inline void writeQuoted(const DecimalField<T> & x, WriteBuffer & buf)
{
writeChar('\'', buf);
writeText(x.getValue(), x.getScale(), buf, {});
writeText(x.getValue(), x.getScale(), buf, /* trailing_zeros */ true);
writeChar('\'', buf);
}

@ -2,9 +2,10 @@

#include <city.h>
#include <Core/Types.h>
#include <Core/UUID.h>
#include <base/StringRef.h>
#include <base/types.h>
#include <base/unaligned.h>
#include <base/StringRef.h>

#include <type_traits>

@ -406,7 +407,7 @@ struct UInt128TrivialHash

struct UUIDTrivialHash
{
size_t operator()(DB::UUID x) const { return x.toUnderType().items[0]; }
size_t operator()(DB::UUID x) const { return DB::UUIDHelpers::getHighBytes(x); }
};

struct UInt256Hash
@ -201,11 +201,11 @@ struct HashTableCell
void setMapped(const value_type & /*value*/) {}

/// Serialization, in binary and text form.
void write(DB::WriteBuffer & wb) const { DB::writeBinary(key, wb); }
void write(DB::WriteBuffer & wb) const { DB::writeBinaryLittleEndian(key, wb); }
void writeText(DB::WriteBuffer & wb) const { DB::writeDoubleQuoted(key, wb); }

/// Deserialization, in binary and text form.
void read(DB::ReadBuffer & rb) { DB::readBinary(key, rb); }
void read(DB::ReadBuffer & rb) { DB::readBinaryLittleEndian(key, rb); }
void readText(DB::ReadBuffer & rb) { DB::readDoubleQuoted(key, rb); }

/// When cell pointer is moved during erase, reinsert or resize operations
@ -10,11 +10,6 @@
namespace DB
{

namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
}

template <typename T>
struct EqualWeightFunction
{
@ -46,8 +41,8 @@ public:
virtual size_t count(std::lock_guard<std::mutex> & /*cache_lock*/) const = 0;
virtual size_t maxSize(std::lock_guard<std::mutex>& /*cache_lock*/) const = 0;

virtual void setMaxCount(size_t /*max_count*/, std::lock_guard<std::mutex> & /* cache_lock */) { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not implemented for cache policy"); }
virtual void setMaxSize(size_t /*max_size_in_bytes*/, std::lock_guard<std::mutex> & /* cache_lock */) { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not implemented for cache policy"); }
virtual void setMaxCount(size_t /*max_count*/, std::lock_guard<std::mutex> & /* cache_lock */) = 0;
virtual void setMaxSize(size_t /*max_size_in_bytes*/, std::lock_guard<std::mutex> & /* cache_lock */) = 0;
virtual void setQuotaForUser(const String & user_name, size_t max_size_in_bytes, size_t max_entries, std::lock_guard<std::mutex> & /*cache_lock*/) { user_quotas->setQuotaForUser(user_name, max_size_in_bytes, max_entries); }

/// HashFunction usually hashes the entire key and the found key will be equal to the provided key. In such cases, use get(). It is also
@ -60,7 +55,7 @@ public:

virtual void remove(const Key & key, std::lock_guard<std::mutex> & /*cache_lock*/) = 0;

virtual void reset(std::lock_guard<std::mutex> & /*cache_lock*/) = 0;
virtual void clear(std::lock_guard<std::mutex> & /*cache_lock*/) = 0;
virtual std::vector<KeyMapped> dump() const = 0;

protected:
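
In condensed form, the interface after this hunk makes resizing and clearing mandatory for every policy instead of offering a throwing fallback. A sketch of the resulting shape (heavily simplified, not the full ClickHouse header):

#include <cstddef>
#include <mutex>
#include <vector>

template <typename Key, typename Mapped, typename KeyMapped>
class ICachePolicySketch
{
public:
    using Lock = std::lock_guard<std::mutex>;

    virtual size_t count(Lock &) const = 0;
    virtual size_t maxSize(Lock &) const = 0;

    /// Previously defaulted to throwing NOT_IMPLEMENTED; now every policy must provide these.
    virtual void setMaxCount(size_t max_count, Lock &) = 0;
    virtual void setMaxSize(size_t max_size_in_bytes, Lock &) = 0;

    virtual void remove(const Key & key, Lock &) = 0;
    virtual void clear(Lock &) = 0;   // renamed from reset() throughout the commit
    virtual std::vector<KeyMapped> dump() const = 0;

    virtual ~ICachePolicySketch() = default;
};
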
@ -7,9 +7,8 @@

namespace DB
{
/// Cache policy LRU evicts entries which are not used for a long time.
/// WeightFunction is a functor that takes Mapped as a parameter and returns "weight" (approximate size)
/// of that value.
/// Cache policy LRU evicts entries which are not used for a long time. Also see cache policy SLRU for reference.
/// WeightFunction is a functor that takes Mapped as a parameter and returns "weight" (approximate size) of that value.
/// Cache starts to evict entries when their total weight exceeds max_size_in_bytes.
/// Value weight should not change after insertion.
/// To work with the thread-safe implementation of this class use a class "CacheBase" with first parameter "LRU"
@ -24,11 +23,12 @@ public:
using typename Base::OnWeightLossFunction;

/** Initialize LRUCachePolicy with max_size_in_bytes and max_count.
* max_size_in_bytes == 0 means the cache accepts no entries.
* max_count == 0 means no restriction on the number of elements.
*/
LRUCachePolicy(size_t max_size_in_bytes_, size_t max_count_, OnWeightLossFunction on_weight_loss_function_)
: Base(std::make_unique<NoCachePolicyUserQuota>())
, max_size_in_bytes(std::max(1uz, max_size_in_bytes_))
, max_size_in_bytes(max_size_in_bytes_)
, max_count(max_count_)
, on_weight_loss_function(on_weight_loss_function_)
{
@ -49,7 +49,19 @@ public:
return max_size_in_bytes;
}

void reset(std::lock_guard<std::mutex> & /* cache_lock */) override
void setMaxCount(size_t max_count_, std::lock_guard<std::mutex> & /* cache_lock */) override
{
max_count = max_count_;
removeOverflow();
}

void setMaxSize(size_t max_size_in_bytes_, std::lock_guard<std::mutex> & /* cache_lock */) override
{
max_size_in_bytes = max_size_in_bytes_;
removeOverflow();
}

void clear(std::lock_guard<std::mutex> & /* cache_lock */) override
{
queue.clear();
cells.clear();
@ -155,8 +167,8 @@ private:

/// Total weight of values.
size_t current_size_in_bytes = 0;
const size_t max_size_in_bytes;
const size_t max_count;
size_t max_size_in_bytes;
size_t max_count;

WeightFunction weight_function;
OnWeightLossFunction on_weight_loss_function;
@ -172,10 +184,7 @@ private:

auto it = cells.find(key);
if (it == cells.end())
{
// Queue became inconsistent
abort();
}
std::terminate(); // Queue became inconsistent

const auto & cell = it->second;

@ -190,10 +199,7 @@ private:
on_weight_loss_function(current_weight_lost);

if (current_size_in_bytes > (1ull << 63))
{
// Queue became inconsistent
abort();
}
std::terminate(); // Queue became inconsistent
}
};

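
Note that both new setters call removeOverflow() immediately, so shrinking a limit evicts right away rather than waiting for the next insert. A toy illustration of that eager-shrink behaviour (assumed semantics, independent of the real class):

#include <cstddef>
#include <list>

struct TinyLru
{
    std::list<int> queue;    // front = least recently used
    size_t max_count = 4;

    void setMaxCount(size_t n)
    {
        max_count = n;
        removeOverflow();    // evict now, mirroring the LRUCachePolicy change above
    }

    void removeOverflow()
    {
        while (queue.size() > max_count)
            queue.pop_front();
    }
};
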
@ -5,6 +5,7 @@
#include <Common/Exception.h>
#include <base/hex.h>
#include <Core/Settings.h>
#include <Core/UUID.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>

@ -227,8 +228,8 @@ bool TracingContext::parseTraceparentHeader(std::string_view traceparent, String

++data;
this->trace_flags = unhex2(data);
this->trace_id.toUnderType().items[0] = trace_id_higher_64;
this->trace_id.toUnderType().items[1] = trace_id_lower_64;
UUIDHelpers::getHighBytes(this->trace_id) = trace_id_higher_64;
UUIDHelpers::getLowBytes(this->trace_id) = trace_id_lower_64;
this->span_id = span_id_64;
return true;
}
@ -239,8 +240,8 @@ String TracingContext::composeTraceparentHeader() const
// parent id.
return fmt::format(
"00-{:016x}{:016x}-{:016x}-{:02x}",
trace_id.toUnderType().items[0],
trace_id.toUnderType().items[1],
UUIDHelpers::getHighBytes(trace_id),
UUIDHelpers::getLowBytes(trace_id),
span_id,
// This cast is needed because fmt is being weird and complaining that
// "mixing character types is not allowed".
@ -335,8 +336,8 @@ TracingContextHolder::TracingContextHolder(
while (_parent_trace_context.trace_id == UUID())
{
// Make sure the randomly generated trace_id is not 0, which is an invalid id.
_parent_trace_context.trace_id.toUnderType().items[0] = thread_local_rng();
_parent_trace_context.trace_id.toUnderType().items[1] = thread_local_rng();
UUIDHelpers::getHighBytes(_parent_trace_context.trace_id) = thread_local_rng();
UUIDHelpers::getLowBytes(_parent_trace_context.trace_id) = thread_local_rng();
}
_parent_trace_context.span_id = 0;
}
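
The format string above follows the W3C traceparent layout: version, 128-bit trace id (the two UUID halves concatenated), 64-bit span id, and flags, all lower-case hex. A self-contained sketch of the composition (hypothetical helper, not the ClickHouse function):

#include <cstdint>
#include <cstdio>
#include <string>

// W3C traceparent: 00-<32 hex trace id>-<16 hex span id>-<2 hex flags>.
// The two 64-bit halves of the UUID print back-to-back as one 128-bit trace id.
std::string composeTraceparent(uint64_t trace_id_high, uint64_t trace_id_low,
                               uint64_t span_id, uint8_t flags)
{
    char buf[64];
    std::snprintf(buf, sizeof(buf), "00-%016llx%016llx-%016llx-%02x",
                  (unsigned long long) trace_id_high,
                  (unsigned long long) trace_id_low,
                  (unsigned long long) span_id,
                  (unsigned) flags);
    return buf;
}
// composeTraceparent(0x61f0c4045cb311e7, 0x907ba6006ad3dba0, 0x52ce929d0e0e4736, 1)
//   -> "00-61f0c4045cb311e7907ba6006ad3dba0-52ce929d0e0e4736-01"
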
@ -9,9 +9,8 @@ namespace DB
{

/// Cache policy SLRU evicts entries which were used only once and are not used for a long time,
/// this policy protects entries which were used more than once from a sequential scan.
/// WeightFunction is a functor that takes Mapped as a parameter and returns "weight" (approximate size)
/// of that value.
/// this policy protects entries which were used more than once from a sequential scan. Also see cache policy LRU for reference.
/// WeightFunction is a functor that takes Mapped as a parameter and returns "weight" (approximate size) of that value.
/// Cache starts to evict entries when their total weight exceeds max_size_in_bytes.
/// Value weight should not change after insertion.
/// To work with the thread-safe implementation of this class use a class "CacheBase" with first parameter "SLRU"
@ -30,8 +29,9 @@ public:
* max_protected_size == 0 means that the default protected size is equal to half of the total max size.
*/
/// TODO: construct from special struct with cache policy parameters (also with max_protected_size).
SLRUCachePolicy(size_t max_size_in_bytes_, size_t max_count_, double size_ratio, OnWeightLossFunction on_weight_loss_function_)
SLRUCachePolicy(size_t max_size_in_bytes_, size_t max_count_, double size_ratio_, OnWeightLossFunction on_weight_loss_function_)
: Base(std::make_unique<NoCachePolicyUserQuota>())
, size_ratio(size_ratio_)
, max_protected_size(static_cast<size_t>(max_size_in_bytes_ * std::min(1.0, size_ratio)))
, max_size_in_bytes(max_size_in_bytes_)
, max_count(max_count_)
@ -54,7 +54,22 @@ public:
return max_size_in_bytes;
}

void reset(std::lock_guard<std::mutex> & /* cache_lock */) override
void setMaxCount(size_t max_count_, std::lock_guard<std::mutex> & /* cache_lock */) override
{
max_count = max_count_;
removeOverflow(protected_queue, max_protected_size, current_protected_size, /*is_protected=*/true);
removeOverflow(probationary_queue, max_size_in_bytes, current_size_in_bytes, /*is_protected=*/false);
}

void setMaxSize(size_t max_size_in_bytes_, std::lock_guard<std::mutex> & /* cache_lock */) override
{
max_protected_size = static_cast<size_t>(max_size_in_bytes_ * std::min(1.0, size_ratio));
max_size_in_bytes = max_size_in_bytes_;
removeOverflow(protected_queue, max_protected_size, current_protected_size, /*is_protected=*/true);
removeOverflow(probationary_queue, max_size_in_bytes, current_size_in_bytes, /*is_protected=*/false);
}

void clear(std::lock_guard<std::mutex> & /* cache_lock */) override
{
cells.clear();
probationary_queue.clear();
@ -68,12 +83,13 @@ public:
auto it = cells.find(key);
if (it == cells.end())
return;

auto & cell = it->second;

current_size_in_bytes -= cell.size;
if (cell.is_protected)
{
current_protected_size -= cell.size;
}

auto & queue = cell.is_protected ? protected_queue : probationary_queue;
queue.erase(cell.queue_iterator);
cells.erase(it);
@ -192,16 +208,17 @@ private:

Cells cells;

const double size_ratio;
size_t current_protected_size = 0;
size_t current_size_in_bytes = 0;
const size_t max_protected_size;
const size_t max_size_in_bytes;
const size_t max_count;
size_t max_protected_size;
size_t max_size_in_bytes;
size_t max_count;

WeightFunction weight_function;
OnWeightLossFunction on_weight_loss_function;

void removeOverflow(SLRUQueue & queue, const size_t max_weight_size, size_t & current_weight_size, bool is_protected)
void removeOverflow(SLRUQueue & queue, size_t max_weight_size, size_t & current_weight_size, bool is_protected)
{
size_t current_weight_lost = 0;
size_t queue_size = queue.size();
@ -223,8 +240,7 @@ private:
{
need_remove = [&]()
{
return ((max_count != 0 && cells.size() > max_count)
|| (current_weight_size > max_weight_size)) && (queue_size > 0);
return ((max_count != 0 && cells.size() > max_count) || (current_weight_size > max_weight_size)) && (queue_size > 0);
};
}

@ -234,10 +250,7 @@ private:

auto it = cells.find(key);
if (it == cells.end())
{
// Queue became inconsistent
abort();
}
std::terminate(); // Queue became inconsistent

auto & cell = it->second;

@ -262,10 +275,7 @@ private:
on_weight_loss_function(current_weight_lost);

if (current_size_in_bytes > (1ull << 63))
{
// Queue became inconsistent
abort();
}
std::terminate(); // Queue became inconsistent
}
};

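
For orientation, SLRU splits capacity into a probationary and a protected queue: a first access admits a key on probation, a second access promotes it, so a one-pass sequential scan cannot flush repeatedly used entries. A miniature sketch of that promotion rule (assumed behaviour, eviction omitted):

#include <list>
#include <unordered_map>

struct MiniSlru
{
    std::list<int> probationary, protected_q;
    std::unordered_map<int, bool> is_protected;   // key -> currently protected?

    void touch(int key)
    {
        auto it = is_protected.find(key);
        if (it == is_protected.end())
        {
            probationary.push_back(key);          // first access: probationary queue
            is_protected[key] = false;
        }
        else if (!it->second)
        {
            probationary.remove(key);             // second access: promote
            protected_q.push_back(key);
            it->second = true;
        }
        // removeOverflow() of both queues omitted for brevity
    }
};
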
@ -121,7 +121,7 @@ public:
max_size_in_bytes = max_size_in_bytes_;
}

void reset(std::lock_guard<std::mutex> & /* cache_lock */) override
void clear(std::lock_guard<std::mutex> & /* cache_lock */) override
{
cache.clear();
}
@ -2,6 +2,7 @@

#include <optional>
#include <base/types.h>
#include <base/simd.h>
#include <Common/BitHelpers.h>
#include <Poco/UTF8Encoding.h>

@ -72,16 +73,13 @@ inline size_t countCodePoints(const UInt8 * data, size_t size)
res += __builtin_popcount(_mm_movemask_epi8(
_mm_cmpgt_epi8(_mm_loadu_si128(reinterpret_cast<const __m128i *>(data)), threshold)));
#elif defined(__aarch64__) && defined(__ARM_NEON)
/// Returns a 64 bit mask of nibbles (4 bits for each byte).
auto get_nibble_mask
= [](uint8x16_t input) -> uint64_t { return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(input), 4)), 0); };
constexpr auto bytes_sse = 16;
const auto * src_end_sse = data + size / bytes_sse * bytes_sse;

const auto threshold = vdupq_n_s8(0xBF);

for (; data < src_end_sse; data += bytes_sse)
res += std::popcount(get_nibble_mask(vcgtq_s8(vld1q_s8(reinterpret_cast<const int8_t *>(data)), threshold)));
res += std::popcount(getNibbleMask(vcgtq_s8(vld1q_s8(reinterpret_cast<const int8_t *>(data)), threshold)));
res >>= 2;
#endif

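
The getNibbleMask helper that replaces the local lambda here (now provided by base/simd.h, whose include is added above, and removed from memcmpSmall.h in the next hunk) emulates SSE's _mm_movemask_epi8 on NEON: vshrn_n_u16(..., 4) keeps four bits per byte, folding a 128-bit comparison result into a single 64-bit integer. A standalone sketch:

#if defined(__aarch64__) && defined(__ARM_NEON)
#include <arm_neon.h>
#include <cstdint>

// Each input byte contributes one nibble to the result (0x0 or 0xF for a
// comparison mask), so an all-equal compare yields 0xFFFFFFFFFFFFFFFF and
// std::popcount(mask) / 4 recovers the number of matching bytes.
inline uint64_t getNibbleMaskSketch(uint8x16_t input)
{
    return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(input), 4)), 0);
}
#endif
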
@ -4,6 +4,8 @@
#include <bit>
#include <cstdint>

#include <base/simd.h>

#include <Core/Defines.h>


@ -504,11 +506,6 @@ inline bool memoryIsZeroSmallAllowOverflow15(const void * data, size_t size)
# include <arm_neon.h>
# pragma clang diagnostic ignored "-Wreserved-identifier"

inline uint64_t getNibbleMask(uint8x16_t res)
{
return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(res), 4)), 0);
}

template <typename Char>
inline int memcmpSmallAllowOverflow15(const Char * a, size_t a_size, const Char * b, size_t b_size)
{
@ -92,7 +92,7 @@ TEST(SLRUCache, removeFromProtected)
ASSERT_TRUE(value == nullptr);
}

TEST(SLRUCache, reset)
TEST(SLRUCache, clear)
{
using SimpleCacheBase = DB::CacheBase<int, int>;
auto slru_cache = SimpleCacheBase("SLRU", /*max_size_in_bytes=*/10, /*max_count=*/0, /*size_ratio*/0.5);
@ -101,7 +101,7 @@ TEST(SLRUCache, reset)

slru_cache.set(2, std::make_shared<int>(4)); /// add to protected_queue

slru_cache.reset();
slru_cache.clear();

auto value = slru_cache.get(1);
ASSERT_TRUE(value == nullptr);
@ -73,8 +73,8 @@ void compressDataForType(const char * source, UInt32 source_size, char * dest)
const char * const source_end = source + source_size;
while (source < source_end)
{
T curr_src = unalignedLoad<T>(source);
unalignedStore<T>(dest, curr_src - prev_src);
T curr_src = unalignedLoadLittleEndian<T>(source);
unalignedStoreLittleEndian<T>(dest, curr_src - prev_src);
prev_src = curr_src;

source += sizeof(T);
@ -94,10 +94,10 @@ void decompressDataForType(const char * source, UInt32 source_size, char * dest,
const char * const source_end = source + source_size;
while (source < source_end)
{
accumulator += unalignedLoad<T>(source);
accumulator += unalignedLoadLittleEndian<T>(source);
if (dest + sizeof(accumulator) > output_end) [[unlikely]]
throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress the data");
unalignedStore<T>(dest, accumulator);
unalignedStoreLittleEndian<T>(dest, accumulator);

source += sizeof(T);
dest += sizeof(T);
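
Switching to the LittleEndian load/store variants pins the byte order of the compressed stream, so data written on a little-endian machine decompresses correctly on a big-endian one. A sketch of what such a fixed-endian pair typically looks like (assumed to match the semantics of unalignedLoadLittleEndian/unalignedStoreLittleEndian; illustration only):

#include <cstdint>
#include <cstring>

// Always read/write little-endian, regardless of the host byte order.
inline uint32_t loadLE32(const char * p)
{
    uint32_t v;
    std::memcpy(&v, p, sizeof(v));        // unaligned-safe copy
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
    v = __builtin_bswap32(v);             // normalize on big-endian hosts
#endif
    return v;
}

inline void storeLE32(char * p, uint32_t v)
{
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
    v = __builtin_bswap32(v);
#endif
    std::memcpy(p, &v, sizeof(v));
}

// Delta step from compressDataForType, one element at a time:
//   storeLE32(dest, curr - prev); prev = curr;
// Decompression accumulates: acc += loadLE32(src); storeLE32(dest, acc);
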
@ -86,6 +86,37 @@ struct DataTypeDecimalTrait
}
};

/// Calculates result = x * multiplier + delta.
/// If the multiplication or the addition overflows, returns false or throws DECIMAL_OVERFLOW.
template <typename T, bool throw_on_error>
inline bool multiplyAdd(const T & x, const T & multiplier, const T & delta, T & result)
{
T multiplied = 0;
if (common::mulOverflow(x, multiplier, multiplied))
{
if constexpr (throw_on_error)
throw Exception(ErrorCodes::DECIMAL_OVERFLOW, "Decimal math overflow");
return false;
}

if (common::addOverflow(multiplied, delta, result))
{
if constexpr (throw_on_error)
throw Exception(ErrorCodes::DECIMAL_OVERFLOW, "Decimal math overflow");
return false;
}

return true;
}

template <typename T>
inline T multiplyAdd(const T & x, const T & multiplier, const T & delta)
{
T res;
multiplyAdd<T, true>(x, multiplier, delta, res);
return res;
}

/** Make a decimal value from whole and fractional components with given scale multiplier.
* where scale_multiplier = scaleMultiplier<T>(scale)
* this is to reduce number of calls to scaleMultiplier when scale is known.
@ -104,23 +135,10 @@ inline bool decimalFromComponentsWithMultiplierImpl(
{
using T = typename DecimalType::NativeType;
const auto fractional_sign = whole < 0 ? -1 : 1;

T whole_scaled = 0;
if (common::mulOverflow(whole, scale_multiplier, whole_scaled))
{
if constexpr (throw_on_error)
throw Exception(ErrorCodes::DECIMAL_OVERFLOW, "Decimal math overflow");
return false;
}

T value;
if (common::addOverflow(whole_scaled, fractional_sign * (fractional % scale_multiplier), value))
{
if constexpr (throw_on_error)
throw Exception(ErrorCodes::DECIMAL_OVERFLOW, "Decimal math overflow");
if (!multiplyAdd<T, throw_on_error>(
whole, scale_multiplier, fractional_sign * (fractional % scale_multiplier), value))
return false;
}

result = DecimalType(value);
return true;
}
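
A standalone analogue of the new multiplyAdd for int64_t, using the compiler builtins that common::mulOverflow/addOverflow presumably wrap:

#include <cstdint>

// result = x * multiplier + delta, reporting overflow instead of wrapping.
// The throwing variant in the hunk above raises DECIMAL_OVERFLOW where this
// sketch returns false.
bool multiplyAddChecked(int64_t x, int64_t multiplier, int64_t delta, int64_t & result)
{
    int64_t multiplied;
    if (__builtin_mul_overflow(x, multiplier, &multiplied))
        return false;
    if (__builtin_add_overflow(multiplied, delta, &result))
        return false;
    return true;
}
// Example: rescaling a value from scale 0 to scale 9 before adding nanoseconds is
// multiplyAddChecked(t, 1'000'000'000, delta_ns, out).
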
@ -138,7 +138,7 @@ template <typename T> bool decimalEqual(T x, T y, UInt32 x_scale, UInt32 y_scale
template <typename T> bool decimalLess(T x, T y, UInt32 x_scale, UInt32 y_scale);
template <typename T> bool decimalLessOrEqual(T x, T y, UInt32 x_scale, UInt32 y_scale);

template <typename T>
template <is_decimal T>
class DecimalField
{
public:
@ -838,7 +838,7 @@ template <> struct Field::EnumToType<Field::Types::Decimal32> { using Type = Dec
template <> struct Field::EnumToType<Field::Types::Decimal64> { using Type = DecimalField<Decimal64>; };
template <> struct Field::EnumToType<Field::Types::Decimal128> { using Type = DecimalField<Decimal128>; };
template <> struct Field::EnumToType<Field::Types::Decimal256> { using Type = DecimalField<Decimal256>; };
template <> struct Field::EnumToType<Field::Types::AggregateFunctionState> { using Type = DecimalField<AggregateFunctionStateData>; };
template <> struct Field::EnumToType<Field::Types::AggregateFunctionState> { using Type = AggregateFunctionStateData; };
template <> struct Field::EnumToType<Field::Types::CustomType> { using Type = CustomType; };
template <> struct Field::EnumToType<Field::Types::Bool> { using Type = UInt64; };
@ -174,8 +174,8 @@ String GTIDSets::toPayload() const
for (const auto & set : sets)
{
// MySQL UUID is big-endian.
writeBinaryBigEndian(set.uuid.toUnderType().items[0], buffer);
writeBinaryBigEndian(set.uuid.toUnderType().items[1], buffer);
writeBinaryBigEndian(UUIDHelpers::getHighBytes(set.uuid), buffer);
writeBinaryBigEndian(UUIDHelpers::getLowBytes(set.uuid), buffer);

UInt64 intervals_size = set.intervals.size();
buffer.write(reinterpret_cast<const char *>(&intervals_size), 8);

@ -940,13 +940,8 @@ namespace MySQLReplication
payload.readStrict(reinterpret_cast<char *>(&commit_flag), 1);

// MySQL UUID is big-endian.
UInt64 high = 0UL;
UInt64 low = 0UL;
readBigEndianStrict(payload, reinterpret_cast<char *>(&low), 8);
gtid.uuid.toUnderType().items[0] = low;

readBigEndianStrict(payload, reinterpret_cast<char *>(&high), 8);
gtid.uuid.toUnderType().items[1] = high;
readBinaryBigEndian(UUIDHelpers::getHighBytes(gtid.uuid), payload);
readBinaryBigEndian(UUIDHelpers::getLowBytes(gtid.uuid), payload);

payload.readStrict(reinterpret_cast<char *>(&gtid.seq_no), 8);

@ -33,8 +33,10 @@ namespace MySQLReplication
inline void readBigEndianStrict(ReadBuffer & payload, char * to, size_t n)
{
payload.readStrict(to, n);
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
char *start = to, *end = to + n;
std::reverse(start, end);
#endif
}

inline void readTimeFractionalPart(ReadBuffer & payload, UInt32 & factional, UInt16 meta)
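
The same read-then-reverse idea, spelled out for one 8-byte field without the ReadBuffer machinery (illustrative helper, not part of the patch):

#include <algorithm>
#include <cstdint>
#include <cstring>

// Read a big-endian 64-bit value from the wire into a native integer:
// copy the bytes, then reverse them on little-endian hosts only.
uint64_t readBE64(const unsigned char * wire)
{
    unsigned char tmp[8];
    std::memcpy(tmp, wire, 8);
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
    std::reverse(tmp, tmp + 8);
#endif
    uint64_t v;
    std::memcpy(&v, tmp, 8);
    return v;
}
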
@ -39,7 +39,7 @@ namespace DB
M(UInt64, restore_threads, 16, "The maximum number of threads to execute RESTORE requests.", 0) \
M(Int32, max_connections, 1024, "Max server connections.", 0) \
M(UInt32, asynchronous_metrics_update_period_s, 1, "Period in seconds for updating asynchronous metrics.", 0) \
M(UInt32, asynchronous_heavy_metrics_update_period_s, 120, "Period in seconds for updating asynchronous metrics.", 0) \
M(UInt32, asynchronous_heavy_metrics_update_period_s, 120, "Period in seconds for updating heavy asynchronous metrics.", 0) \
M(String, default_database, "default", "Default database name.", 0) \
M(String, tmp_policy, "", "Policy for storage with temporary data.", 0) \
M(UInt64, max_temporary_data_on_disk_size, 0, "The maximum amount of storage that could be used for external aggregation, joins or sorting.", 0) \
@ -780,6 +780,7 @@ class IColumn;
M(Bool, allow_experimental_hash_functions, false, "Enable experimental hash functions", 0) \
M(Bool, allow_experimental_object_type, false, "Allow Object and JSON data types", 0) \
M(Bool, allow_experimental_annoy_index, false, "Allows to use Annoy index. Disabled by default because this feature is experimental", 0) \
M(Bool, allow_experimental_usearch_index, false, "Allows to use USearch index. Disabled by default because this feature is experimental", 0) \
M(UInt64, max_limit_for_ann_queries, 1'000'000, "SELECT queries with LIMIT bigger than this setting cannot use ANN indexes. Helps to prevent memory overflows in ANN search indexes.", 0) \
M(Int64, annoy_index_search_k_nodes, -1, "SELECT queries search up to this many nodes in Annoy indexes.", 0) \
M(Bool, throw_on_unsupported_query_inside_transaction, true, "Throw exception if unsupported query is used inside transaction", 0) \
@ -877,8 +878,10 @@ class IColumn;
M(Bool, input_format_orc_case_insensitive_column_matching, false, "Ignore case when matching ORC columns with CH columns.", 0) \
M(Bool, input_format_parquet_case_insensitive_column_matching, false, "Ignore case when matching Parquet columns with CH columns.", 0) \
M(Bool, input_format_parquet_preserve_order, false, "Avoid reordering rows when reading from Parquet files. Usually makes it much slower.", 0) \
M(Bool, input_format_parquet_filter_push_down, true, "When reading Parquet files, skip whole row groups based on the WHERE/PREWHERE expressions and min/max statistics in the Parquet metadata.", 0) \
M(Bool, input_format_allow_seeks, true, "Allow seeks while reading in ORC/Parquet/Arrow input formats", 0) \
M(Bool, input_format_orc_allow_missing_columns, false, "Allow missing columns while reading ORC input formats", 0) \
M(Bool, input_format_orc_use_fast_decoder, true, "Use a faster ORC decoder implementation.", 0) \
M(Bool, input_format_parquet_allow_missing_columns, false, "Allow missing columns while reading Parquet input formats", 0) \
M(UInt64, input_format_parquet_local_file_min_bytes_for_seek, 8192, "Min bytes required for local read (file) to do seek, instead of read with ignore in Parquet input format", 0) \
M(Bool, input_format_arrow_allow_missing_columns, false, "Allow missing columns while reading Arrow input formats", 0) \
@ -9,10 +9,11 @@ namespace UUIDHelpers
{
UUID generateV4()
{
UInt128 res{thread_local_rng(), thread_local_rng()};
res.items[0] = (res.items[0] & 0xffffffffffff0fffull) | 0x0000000000004000ull;
res.items[1] = (res.items[1] & 0x3fffffffffffffffull) | 0x8000000000000000ull;
return UUID{res};
UUID uuid;
getHighBytes(uuid) = (thread_local_rng() & 0xffffffffffff0fffull) | 0x0000000000004000ull;
getLowBytes(uuid) = (thread_local_rng() & 0x3fffffffffffffffull) | 0x8000000000000000ull;

return uuid;
}
}

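
Per RFC 4122, a version-4 UUID carries the version nibble 0100 in its high half and the variant bits 10 at the top of its low half; the two mask-and-or steps above implement exactly that. A small verification sketch on plain integers:

#include <cassert>
#include <cstdint>
#include <random>

int main()
{
    std::mt19937_64 rng{std::random_device{}()};
    // Same masks as generateV4(): clear the target bits, then set them.
    uint64_t high = (rng() & 0xffffffffffff0fffull) | 0x0000000000004000ull;
    uint64_t low  = (rng() & 0x3fffffffffffffffull) | 0x8000000000000000ull;

    assert(((high >> 12) & 0xF) == 4); // version nibble of time_hi_and_version is 4
    assert((low >> 62) == 0b10);       // RFC 4122 variant in the two top bits
}
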
@ -2,6 +2,59 @@

#include <Core/Types.h>

/**
* Implementation Details
* ^^^^^^^^^^^^^^^^^^^^^^
* The underlying implementation for a UUID has it represented as a 128-bit unsigned integer. Underlying this, a wide
* integer with a 64-bit unsigned integer as its base is utilized. This wide integer can be interfaced with as an array
* to access different components of the base. For example, on a Little Endian platform, accessing at index 0 will give
* you the 8 higher bytes, and index 1 will give you the 8 lower bytes. On a Big Endian platform, this is reversed where
* index 0 will give you the 8 lower bytes, and index 1 will give you the 8 higher bytes.
*
* uuid.toUnderType().items[0]
*
* // uint64_t uint64_t
* // [xxxxxxxx] [ ]
*
* uuid.toUnderType().items[1]
*
* // uint64_t uint64_t
* // [ ] [xxxxxxxx]
*
* The way that data is stored in the underlying wide integer treats the data as two 64-bit chunks sequenced in the
* array. On a Little Endian platform, this results in the following layout
*
* // Suppose uuid contains 61f0c404-5cb3-11e7-907b-a6006ad3dba0
*
* uuid.toUnderType().items[0]
*
* // uint64_t as HEX
* // [E7 11 B3 5C 04 C4 F0 61] [A0 DB D3 6A 00 A6 7B 90]
* // ^^^^^^^^^^^^^^^^^^^^^^^
*
* uuid.toUnderType().items[1]
*
* // uint64_t as HEX
* // [E7 11 B3 5C 04 C4 F0 61] [A0 DB D3 6A 00 A6 7B 90]
* // ^^^^^^^^^^^^^^^^^^^^^^^
*
* while on a Big Endian platform this would be
*
* // Suppose uuid contains 61f0c404-5cb3-11e7-907b-a6006ad3dba0
*
* uuid.toUnderType().items[0]
*
* // uint64_t as HEX
* // [90 7B A6 00 6A D3 DB A0] [61 F0 C4 04 5C B3 11 E7]
* // ^^^^^^^^^^^^^^^^^^^^^^^
*
* uuid.toUnderType().items[1]
*
* // uint64_t as HEX
* // [90 7B A6 00 6A D3 DB A0] [61 F0 C4 04 5C B3 11 E7]
* // ^^^^^^^^^^^^^^^^^^^^^^^
*/


namespace DB
{
@ -11,6 +64,29 @@ namespace UUIDHelpers
/// Generate random UUID.
UUID generateV4();

constexpr size_t HighBytes = (std::endian::native == std::endian::little) ? 0 : 1;
constexpr size_t LowBytes = (std::endian::native == std::endian::little) ? 1 : 0;

inline uint64_t getHighBytes(const UUID & uuid)
{
return uuid.toUnderType().items[HighBytes];
}

inline uint64_t & getHighBytes(UUID & uuid)
{
return uuid.toUnderType().items[HighBytes];
}

inline uint64_t getLowBytes(const UUID & uuid)
{
return uuid.toUnderType().items[LowBytes];
}

inline uint64_t & getLowBytes(UUID & uuid)
{
return uuid.toUnderType().items[LowBytes];
}

const UUID Nil{};
}

@ -46,6 +46,7 @@ public:
bool canBeUsedInBooleanContext() const override { return dictionary_type->canBeUsedInBooleanContext(); }
bool isValueRepresentedByNumber() const override { return dictionary_type->isValueRepresentedByNumber(); }
bool isValueRepresentedByInteger() const override { return dictionary_type->isValueRepresentedByInteger(); }
bool isValueRepresentedByUnsignedInteger() const override { return dictionary_type->isValueRepresentedByUnsignedInteger(); }
bool isValueUnambiguouslyRepresentedInContiguousMemoryRegion() const override { return true; }
bool haveMaximumSizeOfValue() const override { return dictionary_type->haveMaximumSizeOfValue(); }
size_t getMaximumSizeOfValueInMemory() const override { return dictionary_type->getMaximumSizeOfValueInMemory(); }
@ -111,25 +111,25 @@ void SerializationUUID::deserializeTextCSV(IColumn & column, ReadBuffer & istr,
void SerializationUUID::serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings &) const
{
UUID x = field.get<UUID>();
writeBinary(x, ostr);
writeBinaryLittleEndian(x, ostr);
}

void SerializationUUID::deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings &) const
{
UUID x;
readBinary(x, istr);
readBinaryLittleEndian(x, istr);
field = NearestFieldType<UUID>(x);
}

void SerializationUUID::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const
{
writeBinary(assert_cast<const ColumnVector<UUID> &>(column).getData()[row_num], ostr);
writeBinaryLittleEndian(assert_cast<const ColumnVector<UUID> &>(column).getData()[row_num], ostr);
}

void SerializationUUID::deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
{
UUID x;
readBinary(x, istr);
readBinaryLittleEndian(x, istr);
assert_cast<ColumnVector<UUID> &>(column).getData().push_back(x);
}

@ -830,6 +830,7 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep
query_context->setSetting("allow_experimental_hash_functions", 1);
query_context->setSetting("allow_experimental_object_type", 1);
query_context->setSetting("allow_experimental_annoy_index", 1);
query_context->setSetting("allow_experimental_usearch_index", 1);
query_context->setSetting("allow_experimental_bigint_types", 1);
query_context->setSetting("allow_experimental_window_functions", 1);
query_context->setSetting("allow_experimental_geo_types", 1);
@ -23,11 +23,10 @@ StoragePtr IDatabase::getTable(const String & name, ContextPtr context) const
return storage;
TableNameHints hints(this->shared_from_this(), context);
std::vector<String> names = hints.getHints(name);
if (!names.empty())
{
if (names.empty())
throw Exception(ErrorCodes::UNKNOWN_TABLE, "Table {}.{} does not exist", backQuoteIfNeed(getDatabaseName()), backQuoteIfNeed(name));
else
throw Exception(ErrorCodes::UNKNOWN_TABLE, "Table {}.{} does not exist. Maybe you meant {}?", backQuoteIfNeed(getDatabaseName()), backQuoteIfNeed(name), backQuoteIfNeed(names[0]));
}
else throw Exception(ErrorCodes::UNKNOWN_TABLE, "Table {}.{} does not exist", backQuoteIfNeed(getDatabaseName()), backQuoteIfNeed(name));
}

std::vector<std::pair<ASTPtr, StoragePtr>> IDatabase::getTablesForBackup(const FilterByNameFunction &, const ContextPtr &) const
@ -16,10 +16,20 @@ if (OMIT_HEAVY_DEBUG_SYMBOLS)
PROPERTIES COMPILE_FLAGS -g0)
endif()

list(REMOVE_ITEM clickhouse_dictionaries_sources DictionaryFactory.cpp DictionarySourceFactory.cpp DictionaryStructure.cpp getDictionaryConfigurationFromAST.cpp)
list(REMOVE_ITEM clickhouse_dictionaries_headers DictionaryFactory.h DictionarySourceFactory.h DictionaryStructure.h getDictionaryConfigurationFromAST.h)
extract_into_parent_list(clickhouse_dictionaries_sources dbms_sources
DictionaryFactory.cpp
DictionarySourceFactory.cpp
DictionaryStructure.cpp
getDictionaryConfigurationFromAST.cpp
)
extract_into_parent_list(clickhouse_dictionaries_headers dbms_headers
DictionaryFactory.h
DictionarySourceFactory.h
DictionaryStructure.h
getDictionaryConfigurationFromAST.h
)

add_library(clickhouse_dictionaries ${clickhouse_dictionaries_sources})
add_library(clickhouse_dictionaries ${clickhouse_dictionaries_headers} ${clickhouse_dictionaries_sources})

target_link_libraries(clickhouse_dictionaries
PRIVATE
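
extract_into_parent_list comes from cmake/dbms_glob_sources.cmake (included in the Functions hunk further down); judging by how it is called here, it moves the named files out of the local list and into the dbms-level list in one step. A hedged sketch of such a helper, not the actual definition:

# Assumed semantics: drop the trailing file arguments from src_list, append
# them to dest_list, and export both lists back to the caller's scope.
function(extract_into_parent_list src_list dest_list)
    list(REMOVE_ITEM ${src_list} ${ARGN})
    list(APPEND ${dest_list} ${ARGN})
    set(${src_list} "${${src_list}}" PARENT_SCOPE)
    set(${dest_list} "${${dest_list}}" PARENT_SCOPE)
endfunction()
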
@ -124,6 +124,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
format_settings.parquet.output_version = settings.output_format_parquet_version;
format_settings.parquet.case_insensitive_column_matching = settings.input_format_parquet_case_insensitive_column_matching;
format_settings.parquet.preserve_order = settings.input_format_parquet_preserve_order;
format_settings.parquet.filter_push_down = settings.input_format_parquet_filter_push_down;
format_settings.parquet.allow_missing_columns = settings.input_format_parquet_allow_missing_columns;
format_settings.parquet.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference;
format_settings.parquet.output_string_as_string = settings.output_format_parquet_string_as_string;
@ -189,6 +190,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
format_settings.orc.case_insensitive_column_matching = settings.input_format_orc_case_insensitive_column_matching;
format_settings.orc.output_string_as_string = settings.output_format_orc_string_as_string;
format_settings.orc.output_compression_method = settings.output_format_orc_compression_method;
format_settings.orc.use_fast_decoder = settings.input_format_orc_use_fast_decoder;
format_settings.defaults_for_omitted_fields = settings.input_format_defaults_for_omitted_fields;
format_settings.capn_proto.enum_comparing_mode = settings.format_capn_proto_enum_comparising_mode;
format_settings.capn_proto.skip_fields_with_unsupported_types_in_schema_inference = settings.input_format_capn_proto_skip_fields_with_unsupported_types_in_schema_inference;
@ -90,9 +90,6 @@ private:
const FormatSettings & settings)>;

// Incompatible with FileSegmentationEngine.
//
// In future we may also want to pass some information about WHERE conditions (SelectQueryInfo?)
// and get some information about projections (min/max/count per column per row group).
using RandomAccessInputCreator = std::function<InputFormatPtr(
ReadBuffer & buf,
const Block & header,
@ -231,6 +231,7 @@ struct FormatSettings
bool allow_missing_columns = false;
bool skip_columns_with_unsupported_types_in_schema_inference = false;
bool case_insensitive_column_matching = false;
bool filter_push_down = true;
std::unordered_set<int> skip_row_groups = {};
bool output_string_as_string = false;
bool output_fixed_string_as_fixed_byte_array = true;
@ -347,6 +348,7 @@ struct FormatSettings
std::unordered_set<int> skip_stripes = {};
bool output_string_as_string = false;
ORCCompression output_compression_method = ORCCompression::NONE;
bool use_fast_decoder = true;
} orc;

/// For capnProto format we should determine how to
@ -3,10 +3,22 @@ add_subdirectory(divide)
include("${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake")
add_headers_and_sources(clickhouse_functions .)

list(REMOVE_ITEM clickhouse_functions_sources IFunction.cpp FunctionFactory.cpp FunctionHelpers.cpp extractTimeZoneFromFunctionArguments.cpp FunctionsLogical.cpp)
list(REMOVE_ITEM clickhouse_functions_headers IFunction.h FunctionFactory.h FunctionHelpers.h extractTimeZoneFromFunctionArguments.h FunctionsLogical.h)
extract_into_parent_list(clickhouse_functions_sources dbms_sources
IFunction.cpp
FunctionFactory.cpp
FunctionHelpers.cpp
extractTimeZoneFromFunctionArguments.cpp
FunctionsLogical.cpp
)
extract_into_parent_list(clickhouse_functions_headers dbms_headers
IFunction.h
FunctionFactory.h
FunctionHelpers.h
extractTimeZoneFromFunctionArguments.h
FunctionsLogical.h
)

add_library(clickhouse_functions_obj OBJECT ${clickhouse_functions_sources})
add_library(clickhouse_functions_obj OBJECT ${clickhouse_functions_headers} ${clickhouse_functions_sources})

list (APPEND OBJECT_LIBS $<TARGET_OBJECTS:clickhouse_functions_obj>)

@ -1,6 +1,7 @@
#pragma once
#include <type_traits>
#include <Core/AccurateComparison.h>
#include <Core/DecimalFunctions.h>
#include <Common/DateLUTImpl.h>

#include <DataTypes/DataTypeDate.h>
@ -14,7 +15,6 @@
#include <Functions/FunctionHelpers.h>
#include <Functions/castTypeToEither.h>
#include <Functions/extractTimeZoneFromFunctionArguments.h>
#include <Functions/TransformDateTime64.h>

#include <IO/WriteHelpers.h>

@ -36,7 +36,9 @@ namespace ErrorCodes
/// Corresponding types:
/// - UInt16 => DataTypeDate
/// - UInt32 => DataTypeDateTime
/// - Int32 => DataTypeDate32
/// - DateTime64 => DataTypeDateTime64
/// - Int8 => error
/// Please note that INPUT and OUTPUT types may differ, e.g.:
/// - 'AddSecondsImpl::execute(UInt32, ...) -> UInt32' is available to the ClickHouse users as 'addSeconds(DateTime, ...) -> DateTime'
/// - 'AddSecondsImpl::execute(UInt16, ...) -> UInt32' is available to the ClickHouse users as 'addSeconds(Date, ...) -> DateTime'
@ -45,35 +47,27 @@ struct AddNanosecondsImpl
{
static constexpr auto name = "addNanoseconds";

static inline NO_SANITIZE_UNDEFINED DecimalUtils::DecimalComponents<DateTime64>
execute(DecimalUtils::DecimalComponents<DateTime64> t, Int64 delta, const DateLUTImpl &, UInt16 scale = DataTypeDateTime64::default_scale)
{
Int64 multiplier = DecimalUtils::scaleMultiplier<DateTime64>(9 - scale);
auto division = std::div(t.fractional * multiplier + delta, static_cast<Int64>(1000000000));
return {t.whole * multiplier + division.quot, t.fractional * multiplier + delta};
}

static inline NO_SANITIZE_UNDEFINED DateTime64
execute(DateTime64 t, Int64 delta, const DateLUTImpl &, UInt16 scale = 0)
{
Int64 multiplier = DecimalUtils::scaleMultiplier<DateTime64>(9 - scale);
return t * multiplier + delta;
return DateTime64(DecimalUtils::multiplyAdd(t.value, multiplier, delta));
}

static inline NO_SANITIZE_UNDEFINED UInt32 execute(UInt32 t, Int64 delta, const DateLUTImpl &, UInt16 = 0)
static inline NO_SANITIZE_UNDEFINED DateTime64 execute(UInt32 t, Int64 delta, const DateLUTImpl &, UInt16 = 0)
{
Int64 multiplier = DecimalUtils::scaleMultiplier<DateTime64>(9);
return static_cast<UInt32>(t * multiplier + delta);
return DateTime64(DecimalUtils::multiplyAdd(static_cast<Int64>(t), multiplier, delta));
}

static inline NO_SANITIZE_UNDEFINED DateTime64 execute(UInt16, Int64, const DateLUTImpl &, UInt16 = 0)
static inline NO_SANITIZE_UNDEFINED Int8 execute(UInt16, Int64, const DateLUTImpl &, UInt16 = 0)
{
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "addNanoSeconds() cannot be used with Date");
throw Exception(ErrorCodes::LOGICAL_ERROR, "addNanoseconds() cannot be used with Date");
}

static inline NO_SANITIZE_UNDEFINED DateTime64 execute(Int32, Int64, const DateLUTImpl &, UInt16 = 0)
static inline NO_SANITIZE_UNDEFINED Int8 execute(Int32, Int64, const DateLUTImpl &, UInt16 = 0)
{
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "addNanoSeconds() cannot be used with Date32");
throw Exception(ErrorCodes::LOGICAL_ERROR, "addNanoseconds() cannot be used with Date32");
}
};

@ -81,43 +75,29 @@ struct AddMicrosecondsImpl
{
static constexpr auto name = "addMicroseconds";

static inline NO_SANITIZE_UNDEFINED DecimalUtils::DecimalComponents<DateTime64>
execute(DecimalUtils::DecimalComponents<DateTime64> t, Int64 delta, const DateLUTImpl &, UInt16 scale = 0)
{
Int64 multiplier = DecimalUtils::scaleMultiplier<DateTime64>(std::abs(6 - scale));
if (scale <= 6)
{
auto division = std::div((t.fractional + delta), static_cast<Int64>(10e6));
return {t.whole * multiplier + division.quot, division.rem};
}
else
{
auto division = std::div((t.fractional + delta * multiplier), static_cast<Int64>(10e6 * multiplier));
return {t.whole + division.quot, division.rem};
}
}

static inline NO_SANITIZE_UNDEFINED DateTime64
execute(DateTime64 t, Int64 delta, const DateLUTImpl &, UInt16 scale = 0)
{
Int64 multiplier = DecimalUtils::scaleMultiplier<DateTime64>(std::abs(6 - scale));
return scale <= 6 ? t * multiplier + delta : t + delta * multiplier;
return DateTime64(scale <= 6
? DecimalUtils::multiplyAdd(t.value, multiplier, delta)
: DecimalUtils::multiplyAdd(delta, multiplier, t.value));
}

static inline NO_SANITIZE_UNDEFINED UInt32 execute(UInt32 t, Int64 delta, const DateLUTImpl &, UInt16 = 0)
static inline NO_SANITIZE_UNDEFINED DateTime64 execute(UInt32 t, Int64 delta, const DateLUTImpl &, UInt16 = 0)
{
Int64 multiplier = DecimalUtils::scaleMultiplier<DateTime64>(6);
return static_cast<UInt32>(t * multiplier + delta);
return DateTime64(DecimalUtils::multiplyAdd(static_cast<Int64>(t), multiplier, delta));
}

static inline NO_SANITIZE_UNDEFINED DateTime64 execute(UInt16, Int64, const DateLUTImpl &, UInt16 = 0)
static inline NO_SANITIZE_UNDEFINED Int8 execute(UInt16, Int64, const DateLUTImpl &, UInt16 = 0)
{
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "addMicroSeconds() cannot be used with Date");
throw Exception(ErrorCodes::LOGICAL_ERROR, "addMicroseconds() cannot be used with Date");
}

static inline NO_SANITIZE_UNDEFINED DateTime64 execute(Int32, Int64, const DateLUTImpl &, UInt16 = 0)
static inline NO_SANITIZE_UNDEFINED Int8 execute(Int32, Int64, const DateLUTImpl &, UInt16 = 0)
{
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "addMicroSeconds() cannot be used with Date32");
throw Exception(ErrorCodes::LOGICAL_ERROR, "addMicroseconds() cannot be used with Date32");
}
};

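
The scale arithmetic these hunks centralize is plain fixed-point rescaling: a DateTime64 with scale s stores units of 10^-s seconds, so adding a nanosecond delta at scale s first multiplies by 10^(9-s). A worked example with concrete numbers (illustration only, plain integers instead of DateTime64):

#include <cassert>
#include <cstdint>

int main()
{
    int64_t t_ms = 1'700'000'000'123;            // DateTime64(3): milliseconds since epoch
    int64_t multiplier = 1'000'000;              // 10^(9 - 3)
    int64_t delta_ns = 42;

    // multiplyAdd(t, multiplier, delta) from the hunks above, spelled out:
    int64_t t_ns = t_ms * multiplier + delta_ns;
    assert(t_ns == 1'700'000'000'123'000'042);   // DateTime64(9): nanosecond resolution
}
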
@ -125,43 +105,29 @@ struct AddMillisecondsImpl
{
static constexpr auto name = "addMilliseconds";

static inline NO_SANITIZE_UNDEFINED DecimalUtils::DecimalComponents<DateTime64>
execute(DecimalUtils::DecimalComponents<DateTime64> t, Int64 delta, const DateLUTImpl &, UInt16 scale = DataTypeDateTime64::default_scale)
{
Int64 multiplier = DecimalUtils::scaleMultiplier<DateTime64>(std::abs(3 - scale));
if (scale <= 3)
{
auto division = std::div((t.fractional + delta), static_cast<Int64>(1000));
return {t.whole * multiplier + division.quot, division.rem};
}
else
{
auto division = std::div((t.fractional + delta * multiplier), static_cast<Int64>(1000 * multiplier));
return {t.whole + division.quot,division.rem};
}
}

static inline NO_SANITIZE_UNDEFINED DateTime64
execute(DateTime64 t, Int64 delta, const DateLUTImpl &, UInt16 scale = 0)
{
Int64 multiplier = DecimalUtils::scaleMultiplier<DateTime64>(std::abs(3 - scale));
return scale <= 3 ? t * multiplier + delta : t + delta * multiplier;
return DateTime64(scale <= 3
? DecimalUtils::multiplyAdd(t.value, multiplier, delta)
: DecimalUtils::multiplyAdd(delta, multiplier, t.value));
}

static inline NO_SANITIZE_UNDEFINED UInt32 execute(UInt32 t, Int64 delta, const DateLUTImpl &, UInt16 = 0)
static inline NO_SANITIZE_UNDEFINED DateTime64 execute(UInt32 t, Int64 delta, const DateLUTImpl &, UInt16 = 0)
{
Int64 multiplier = DecimalUtils::scaleMultiplier<DateTime64>(3);
return static_cast<UInt32>(t * multiplier + delta);
return DateTime64(DecimalUtils::multiplyAdd(static_cast<Int64>(t), multiplier, delta));
}

static inline NO_SANITIZE_UNDEFINED DateTime64 execute(UInt16, Int64, const DateLUTImpl &, UInt16 = 0)
static inline NO_SANITIZE_UNDEFINED Int8 execute(UInt16, Int64, const DateLUTImpl &, UInt16 = 0)
{
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "addMilliSeconds() cannot be used with Date");
throw Exception(ErrorCodes::LOGICAL_ERROR, "addMilliseconds() cannot be used with Date");
}

static inline NO_SANITIZE_UNDEFINED DateTime64 execute(Int32, Int64, const DateLUTImpl &, UInt16 = 0)
static inline NO_SANITIZE_UNDEFINED Int8 execute(Int32, Int64, const DateLUTImpl &, UInt16 = 0)
{
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "addMilliSeconds() cannot be used with Date32");
throw Exception(ErrorCodes::LOGICAL_ERROR, "addMilliseconds() cannot be used with Date32");
}
};

@ -169,16 +135,10 @@ struct AddSecondsImpl
{
static constexpr auto name = "addSeconds";

static inline NO_SANITIZE_UNDEFINED DecimalUtils::DecimalComponents<DateTime64>
execute(DecimalUtils::DecimalComponents<DateTime64> t, Int64 delta, const DateLUTImpl &, UInt16 = 0)
{
return {t.whole + delta, t.fractional};
}

static inline NO_SANITIZE_UNDEFINED DateTime64
execute(DateTime64 t, Int64 delta, const DateLUTImpl &, UInt16 scale = 0)
{
return t + delta * DecimalUtils::scaleMultiplier<DateTime64>(scale);
return DateTime64(DecimalUtils::multiplyAdd(delta, DecimalUtils::scaleMultiplier<DateTime64>(scale), t.value));
}

static inline NO_SANITIZE_UNDEFINED UInt32 execute(UInt32 t, Int64 delta, const DateLUTImpl &, UInt16 = 0)
@ -189,6 +149,7 @@ struct AddSecondsImpl
static inline NO_SANITIZE_UNDEFINED Int64 execute(Int32 d, Int64 delta, const DateLUTImpl & time_zone, UInt16 = 0)
{
// use default datetime64 scale
static_assert(DataTypeDateTime64::default_scale == 3, "");
return (time_zone.fromDayNum(ExtendedDayNum(d)) + delta) * 1000;
}

@ -202,12 +163,6 @@ struct AddMinutesImpl
{
static constexpr auto name = "addMinutes";

static inline NO_SANITIZE_UNDEFINED DecimalUtils::DecimalComponents<DateTime64>
execute(DecimalUtils::DecimalComponents<DateTime64> t, Int64 delta, const DateLUTImpl &, UInt16 = 0)
{
return {t.whole + delta * 60, t.fractional};
}

static inline NO_SANITIZE_UNDEFINED DateTime64
execute(DateTime64 t, Int64 delta, const DateLUTImpl &, UInt16 scale = 0)
{
@ -222,6 +177,7 @@ struct AddMinutesImpl
static inline NO_SANITIZE_UNDEFINED Int64 execute(Int32 d, Int64 delta, const DateLUTImpl & time_zone, UInt16 = 0)
{
// use default datetime64 scale
static_assert(DataTypeDateTime64::default_scale == 3, "");
return (time_zone.fromDayNum(ExtendedDayNum(d)) + delta * 60) * 1000;
}

@ -235,12 +191,6 @@ struct AddHoursImpl
{
static constexpr auto name = "addHours";

static inline NO_SANITIZE_UNDEFINED DecimalUtils::DecimalComponents<DateTime64>
execute(DecimalUtils::DecimalComponents<DateTime64> t, Int64 delta, const DateLUTImpl &, UInt16 = 0)
{
return {t.whole + delta * 3600, t.fractional};
}

static inline NO_SANITIZE_UNDEFINED DateTime64
execute(DateTime64 t, Int64 delta, const DateLUTImpl &, UInt16 scale = 0)
{
@ -255,6 +205,7 @@ struct AddHoursImpl
static inline NO_SANITIZE_UNDEFINED Int64 execute(Int32 d, Int64 delta, const DateLUTImpl & time_zone, UInt16 = 0)
{
// use default datetime64 scale
static_assert(DataTypeDateTime64::default_scale == 3, "");
return (time_zone.fromDayNum(ExtendedDayNum(d)) + delta * 3600) * 1000;
}

@ -268,12 +219,6 @@ struct AddDaysImpl
{
static constexpr auto name = "addDays";

static inline NO_SANITIZE_UNDEFINED DecimalUtils::DecimalComponents<DateTime64>
execute(DecimalUtils::DecimalComponents<DateTime64> t, Int64 delta, const DateLUTImpl & time_zone, UInt16 = 0)
{
return {time_zone.addDays(t.whole, delta), t.fractional};
}

static inline NO_SANITIZE_UNDEFINED DateTime64
execute(DateTime64 t, Int64 delta, const DateLUTImpl & time_zone, UInt16 scale = 0)
{
@ -302,12 +247,6 @@ struct AddWeeksImpl
{
static constexpr auto name = "addWeeks";

static inline NO_SANITIZE_UNDEFINED DecimalUtils::DecimalComponents<DateTime64>
execute(DecimalUtils::DecimalComponents<DateTime64> t, Int64 delta, const DateLUTImpl & time_zone, UInt16 = 0)
{
return {time_zone.addWeeks(t.whole, delta), t.fractional};
}

static inline NO_SANITIZE_UNDEFINED DateTime64
execute(DateTime64 t, Int64 delta, const DateLUTImpl & time_zone, UInt16 scale = 0)
{
@ -336,12 +275,6 @@ struct AddMonthsImpl
{
static constexpr auto name = "addMonths";

static inline NO_SANITIZE_UNDEFINED DecimalUtils::DecimalComponents<DateTime64>
execute(DecimalUtils::DecimalComponents<DateTime64> t, Int64 delta, const DateLUTImpl & time_zone, UInt16 = 0)
{
return {time_zone.addMonths(t.whole, delta), t.fractional};
}

static inline NO_SANITIZE_UNDEFINED DateTime64
execute(DateTime64 t, Int64 delta, const DateLUTImpl & time_zone, UInt16 scale = 0)
{
@ -370,12 +303,6 @@ struct AddQuartersImpl
{
static constexpr auto name = "addQuarters";

static inline DecimalUtils::DecimalComponents<DateTime64>
execute(DecimalUtils::DecimalComponents<DateTime64> t, Int64 delta, const DateLUTImpl & time_zone, UInt16 = 0)
{
return {time_zone.addQuarters(t.whole, delta), t.fractional};
}

static inline NO_SANITIZE_UNDEFINED DateTime64
execute(DateTime64 t, Int64 delta, const DateLUTImpl & time_zone, UInt16 scale = 0)
{
@ -404,12 +331,6 @@ struct AddYearsImpl
{
static constexpr auto name = "addYears";

static inline NO_SANITIZE_UNDEFINED DecimalUtils::DecimalComponents<DateTime64>
execute(DecimalUtils::DecimalComponents<DateTime64> t, Int64 delta, const DateLUTImpl & time_zone, UInt16 = 0)
{
return {time_zone.addYears(t.whole, delta), t.fractional};
}

static inline NO_SANITIZE_UNDEFINED DateTime64
execute(DateTime64 t, Int64 delta, const DateLUTImpl & time_zone, UInt16 scale = 0)
{
@ -581,11 +502,11 @@ namespace date_and_time_type_details
// Compile-time mapping of value (DataType::FieldType) types to corresponding DataType
template <typename FieldType> struct ResultDataTypeMap {};
template <> struct ResultDataTypeMap<UInt16> { using ResultDataType = DataTypeDate; };
template <> struct ResultDataTypeMap<Int16> { using ResultDataType = DataTypeDate; };
template <> struct ResultDataTypeMap<UInt32> { using ResultDataType = DataTypeDateTime; };
template <> struct ResultDataTypeMap<Int32> { using ResultDataType = DataTypeDate32; };
template <> struct ResultDataTypeMap<DateTime64> { using ResultDataType = DataTypeDateTime64; };
template <> struct ResultDataTypeMap<Int64> { using ResultDataType = DataTypeDateTime64; };
template <> struct ResultDataTypeMap<Int8> { using ResultDataType = DataTypeInt8; }; // error
}

template <typename Transform>
@ -705,6 +626,10 @@ public:

return std::make_shared<DataTypeDateTime64>(target_scale.value_or(DataTypeDateTime64::default_scale), std::move(timezone));
}
else if constexpr (std::is_same_v<ResultDataType, DataTypeInt8>)
{
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "{} cannot be used with {}", getName(), arguments[0].type->getName());
}

throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected result type in datetime add interval function");
}
@@ -507,8 +507,8 @@ public:

// use executeOneUIntOrInt instead of using executeOneString
// because the latter one outputs the string in the memory order
Impl::executeOneUIntOrInt(uuid[i].toUnderType().items[0], end, false, false);
Impl::executeOneUIntOrInt(uuid[i].toUnderType().items[1], end, false, true);
Impl::executeOneUIntOrInt(UUIDHelpers::getHighBytes(uuid[i]), end, false, false);
Impl::executeOneUIntOrInt(UUIDHelpers::getLowBytes(uuid[i]), end, false, true);

pos += end - begin;
out_offsets[i] = pos;

@@ -203,18 +203,15 @@ struct ConvertImpl
}
}

if constexpr (std::is_same_v<FromDataType, DataTypeUUID> && std::is_same_v<ToDataType, DataTypeUInt128>)
{
static_assert(std::is_same_v<DataTypeUInt128::FieldType, DataTypeUUID::FieldType::UnderlyingType>, "UInt128 and UUID types must be same");
if constexpr (std::endian::native == std::endian::little)
{
vec_to[i].items[1] = vec_from[i].toUnderType().items[0];
vec_to[i].items[0] = vec_from[i].toUnderType().items[1];
}
else
{
vec_to[i] = vec_from[i].toUnderType();
}
static_assert(
std::is_same_v<DataTypeUInt128::FieldType, DataTypeUUID::FieldType::UnderlyingType>,
"UInt128 and UUID types must be same");

vec_to[i].items[1] = vec_from[i].toUnderType().items[0];
vec_to[i].items[0] = vec_from[i].toUnderType().items[1];

continue;
}

@@ -55,6 +55,7 @@ REGISTER_FUNCTION(CurrentDatabase)
{
factory.registerFunction<FunctionCurrentDatabase>();
factory.registerAlias("DATABASE", FunctionCurrentDatabase::name, FunctionFactory::CaseInsensitive);
factory.registerAlias("SCHEMA", FunctionCurrentDatabase::name, FunctionFactory::CaseInsensitive);
factory.registerAlias("current_database", FunctionCurrentDatabase::name, FunctionFactory::CaseInsensitive);
}

@@ -60,9 +60,8 @@ public:
{
/// https://tools.ietf.org/html/rfc4122#section-4.4

UInt128 & impl = uuid.toUnderType();
impl.items[0] = (impl.items[0] & 0xffffffffffff0fffull) | 0x0000000000004000ull;
impl.items[1] = (impl.items[1] & 0x3fffffffffffffffull) | 0x8000000000000000ull;
UUIDHelpers::getHighBytes(uuid) = (UUIDHelpers::getHighBytes(uuid) & 0xffffffffffff0fffull) | 0x0000000000004000ull;
UUIDHelpers::getLowBytes(uuid) = (UUIDHelpers::getLowBytes(uuid) & 0x3fffffffffffffffull) | 0x8000000000000000ull;
}

return col_res;
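Both hunks above swap raw items[] indexing for named accessors. Judging from the call sites, the helpers presumably just alias the two 64-bit halves of the UUID's underlying UInt128; a sketch of that assumed shape, not the verified definition:

namespace UUIDHelpers
{
/// Assumed shape: name the halves so call sites stop hard-coding indices.
/// getHighBytes() carries the RFC 4122 version field (the 0x4000 above),
/// getLowBytes() the variant bits (the 0x8000... above).
inline UInt64 & getHighBytes(UUID & uuid) { return uuid.toUnderType().items[0]; }
inline UInt64 & getLowBytes(UUID & uuid) { return uuid.toUnderType().items[1]; }
}
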
@@ -1,26 +1,27 @@
#include <Functions/FunctionFactory.h>
#include <Functions/castTypeToEither.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/castTypeToEither.h>

#include <Core/callOnTypeIndex.h>

#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeFixedString.h>
#include <Columns/ColumnConst.h>
#include <Columns/ColumnDecimal.h>
#include <Columns/ColumnFixedString.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnVector.h>
#include <DataTypes/DataTypeDate.h>
#include <DataTypes/DataTypeDateTime.h>
#include <DataTypes/DataTypeUUID.h>
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/DataTypesDecimal.h>
#include <DataTypes/DataTypeDateTime64.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnFixedString.h>
#include <Columns/ColumnConst.h>
#include <Columns/ColumnVector.h>
#include <Columns/ColumnDecimal.h>
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/DataTypeFixedString.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeUUID.h>
#include <DataTypes/DataTypesDecimal.h>
#include <DataTypes/DataTypesNumber.h>

#include <Common/typeid_cast.h>
#include <Common/TransformEndianness.hpp>
#include <Common/memcpySmall.h>
#include <Common/typeid_cast.h>

#include <base/unaligned.h>

@@ -261,8 +262,10 @@ public:
memcpy(static_cast<void*>(&to[i]), static_cast<const void*>(&from[i]), copy_size);
else
{
size_t offset_to = sizeof(To) > sizeof(From) ? sizeof(To) - sizeof(From) : 0;
memcpy(reinterpret_cast<char*>(&to[i]) + offset_to, static_cast<const void*>(&from[i]), copy_size);
// Handle the cases of both 128-bit representation to 256-bit and 128-bit to 64-bit or lower.
const size_t offset_from = sizeof(From) > sizeof(To) ? sizeof(From) - sizeof(To) : 0;
const size_t offset_to = sizeof(To) > sizeof(From) ? sizeof(To) - sizeof(From) : 0;
memcpy(reinterpret_cast<char *>(&to[i]) + offset_to, reinterpret_cast<const char *>(&from[i]) + offset_from, copy_size);
}

}
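The rewritten branch computes symmetric offsets so that both narrowing (128-bit to 64-bit or lower) and widening (128-bit to 256-bit) copies stay anchored to the same end of the value. The same arithmetic as a standalone sketch; truncateOrExtend is a hypothetical name, and a big-endian layout is assumed, as in the branch shown:

#include <algorithm>
#include <cstddef>
#include <cstring>

template <typename To, typename From>
To truncateOrExtend(const From & from)
{
    To to{};
    // Only the overlapping bytes move; the wider side is offset so the copy
    // tracks the least significant bytes of a big-endian value.
    const size_t copy_size = std::min(sizeof(From), sizeof(To));
    const size_t offset_from = sizeof(From) > sizeof(To) ? sizeof(From) - sizeof(To) : 0;
    const size_t offset_to = sizeof(To) > sizeof(From) ? sizeof(To) - sizeof(From) : 0;
    std::memcpy(reinterpret_cast<char *>(&to) + offset_to,
                reinterpret_cast<const char *>(&from) + offset_from,
                copy_size);
    return to;
}
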
@@ -315,7 +318,11 @@ private:
{
std::string_view data = src.getDataAt(i).toView();

#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
memcpy(&data_to[offset], data.data(), std::min(n, data.size()));
#else
reverseMemcpy(&data_to[offset], data.data(), std::min(n, data.size()));
#endif
offset += n;
}
}
@@ -326,7 +333,11 @@ private:
ColumnFixedString::Chars & data_to = dst.getChars();
data_to.resize(n * rows);

#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
memcpy(data_to.data(), src.getRawData().data(), data_to.size());
#else
reverseMemcpy(data_to.data(), src.getRawData().data(), data_to.size());
#endif
}

static void NO_INLINE executeToString(const IColumn & src, ColumnString & dst)
@@ -7,6 +7,8 @@

#include <string_view>

#include <base/simd.h>

#ifdef __SSE2__
# include <emmintrin.h>
#endif
@@ -73,16 +75,13 @@ struct ToValidUTF8Impl
/// Fast skip of ASCII for aarch64.
static constexpr size_t SIMD_BYTES = 16;
const char * simd_end = p + (end - p) / SIMD_BYTES * SIMD_BYTES;
/// Returns a 64 bit mask of nibbles (4 bits for each byte).
auto get_nibble_mask = [](uint8x16_t input) -> uint64_t
{ return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(input), 4)), 0); };
/// Other options include
/// vmaxvq_u8(input) < 0b10000000;
/// Used by SIMDJSON, has latency 3 for M1, 6 for everything else
/// SIMDJSON uses it for 64 byte masks, so it's a little different.
/// vmaxvq_u32(vandq_u32(input, vdupq_n_u32(0x80808080))) // u32 version has latency 3
/// shrn version has universally <=3 cycles, on servers 2 cycles.
while (p < simd_end && get_nibble_mask(vcgeq_u8(vld1q_u8(reinterpret_cast<const uint8_t *>(p)), vdupq_n_u8(0x80))) == 0)
while (p < simd_end && getNibbleMask(vcgeq_u8(vld1q_u8(reinterpret_cast<const uint8_t *>(p)), vdupq_n_u8(0x80))) == 0)
p += SIMD_BYTES;

if (!(p < end))
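This hunk and the matching WriteBufferValidUTF8 one below replace a per-site lambda with a shared getNibbleMask from base/simd.h. Judging from the removed lambda, the helper is presumably a direct extraction of it; a sketch under that assumption:

#include <cstdint>
#include <arm_neon.h>

/// vshrn_n_u16(..., 4) shifts each 16-bit lane right by 4 and narrows it to
/// 8 bits, collapsing the 16-byte comparison mask into one 64-bit value with
/// 4 mask bits per input byte.
inline uint64_t getNibbleMask(uint8x16_t input)
{
    return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(input), 4)), 0);
}
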
@@ -19,7 +19,10 @@ public:
class ReadBufferFromOwnString : public String, public ReadBufferFromString
{
public:
explicit ReadBufferFromOwnString(const String & s_): String(s_), ReadBufferFromString(*this) {}
template <typename S>
explicit ReadBufferFromOwnString(S && s_) : String(std::forward<S>(s_)), ReadBufferFromString(*this)
{
}
};

}
@@ -12,6 +12,8 @@
#include <cstdlib>
#include <bit>

#include <base/simd.h>

#ifdef __SSE2__
#include <emmintrin.h>
#endif
@@ -51,36 +53,25 @@ UUID parseUUID(std::span<const UInt8> src)
{
UUID uuid;
const auto * src_ptr = src.data();
auto * dst = reinterpret_cast<UInt8 *>(&uuid);
const auto size = src.size();

#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
const std::reverse_iterator dst_it(dst + sizeof(UUID));
const std::reverse_iterator dst(reinterpret_cast<UInt8 *>(&uuid) + sizeof(UUID));
#else
auto * dst = reinterpret_cast<UInt8 *>(&uuid);
#endif
if (size == 36)
{
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
parseHex<4>(src_ptr, dst_it + 8);
parseHex<2>(src_ptr + 9, dst_it + 12);
parseHex<2>(src_ptr + 14, dst_it + 14);
parseHex<2>(src_ptr + 19, dst_it);
parseHex<6>(src_ptr + 24, dst_it + 2);
#else
parseHex<4>(src_ptr, dst);
parseHex<2>(src_ptr + 9, dst + 4);
parseHex<2>(src_ptr + 14, dst + 6);
parseHex<2>(src_ptr + 19, dst + 8);
parseHex<6>(src_ptr + 24, dst + 10);
#endif
parseHex<4>(src_ptr, dst + 8);
parseHex<2>(src_ptr + 9, dst + 12);
parseHex<2>(src_ptr + 14, dst + 14);
parseHex<2>(src_ptr + 19, dst);
parseHex<6>(src_ptr + 24, dst + 2);
}
else if (size == 32)
{
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
parseHex<8>(src_ptr, dst_it + 8);
parseHex<8>(src_ptr + 16, dst_it);
#else
parseHex<16>(src_ptr, dst);
#endif
parseHex<8>(src_ptr, dst + 8);
parseHex<8>(src_ptr + 16, dst);
}
else
throw Exception(ErrorCodes::CANNOT_PARSE_UUID, "Unexpected length when trying to parse UUID ({})", size);
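The little-endian path above now writes through a std::reverse_iterator, so each parseHex call fills the UUID's storage back to front. A self-contained illustration of that property, with illustrative values only:

#include <array>
#include <cstdint>
#include <iterator>

int main()
{
    std::array<uint8_t, 4> buf{};
    // A reverse_iterator built from the one-past-the-end pointer dereferences
    // to the last element, so it[k] targets buf[size - 1 - k] and sequential
    // writes run from the back of the buffer toward the front.
    const std::reverse_iterator it(buf.data() + buf.size());
    it[0] = 0xAA; // buf[3]
    it[1] = 0xBB; // buf[2]
    // buf is now {0x00, 0x00, 0xBB, 0xAA}.
}
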
@@ -819,14 +810,11 @@ void readCSVStringInto(Vector & s, ReadBuffer & buf, const FormatSettings::CSV &
auto rc = vdupq_n_u8('\r');
auto nc = vdupq_n_u8('\n');
auto dc = vdupq_n_u8(delimiter);
/// Returns a 64 bit mask of nibbles (4 bits for each byte).
auto get_nibble_mask = [](uint8x16_t input) -> uint64_t
{ return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(input), 4)), 0); };
for (; next_pos + 15 < buf.buffer().end(); next_pos += 16)
{
uint8x16_t bytes = vld1q_u8(reinterpret_cast<const uint8_t *>(next_pos));
auto eq = vorrq_u8(vorrq_u8(vceqq_u8(bytes, rc), vceqq_u8(bytes, nc)), vceqq_u8(bytes, dc));
uint64_t bit_mask = get_nibble_mask(eq);
uint64_t bit_mask = getNibbleMask(eq);
if (bit_mask)
{
next_pos += std::countr_zero(bit_mask) >> 2;
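Since getNibbleMask yields 4 mask bits per scanned byte, the trailing-zero count is divided by 4 (the >> 2 above) to recover the byte position of the first delimiter. In sketch form:

#include <bit>
#include <cstddef>
#include <cstdint>

// Index of the first matching byte in a 16-byte block, given a
// getNibbleMask-style mask with 4 bits per byte (mask assumed non-zero).
constexpr size_t firstMatchingByte(uint64_t nibble_mask)
{
    return std::countr_zero(nibble_mask) >> 2;
}

static_assert(firstMatchingByte(0x00000000000F0000ULL) == 4);
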
@@ -116,6 +116,13 @@ inline void readPODBinary(T & x, ReadBuffer & buf)
buf.readStrict(reinterpret_cast<char *>(&x), sizeof(x)); /// NOLINT
}

inline void readUUIDBinary(UUID & x, ReadBuffer & buf)
{
auto & uuid = x.toUnderType();
readPODBinary(uuid.items[0], buf);
readPODBinary(uuid.items[1], buf);
}

template <typename T>
inline void readIntBinary(T & x, ReadBuffer & buf)
{
@@ -1106,16 +1113,26 @@ inline void readBinary(Decimal64 & x, ReadBuffer & buf) { readPODBinary(x, buf);
inline void readBinary(Decimal128 & x, ReadBuffer & buf) { readPODBinary(x, buf); }
inline void readBinary(Decimal256 & x, ReadBuffer & buf) { readPODBinary(x.value, buf); }
inline void readBinary(LocalDate & x, ReadBuffer & buf) { readPODBinary(x, buf); }
inline void readBinary(UUID & x, ReadBuffer & buf) { readPODBinary(x, buf); }
inline void readBinary(IPv4 & x, ReadBuffer & buf) { readPODBinary(x, buf); }
inline void readBinary(IPv6 & x, ReadBuffer & buf) { readPODBinary(x, buf); }

inline void readBinary(UUID & x, ReadBuffer & buf)
{
readUUIDBinary(x, buf);
}

inline void readBinary(CityHash_v1_0_2::uint128 & x, ReadBuffer & buf)
{
readPODBinary(x.low64, buf);
readPODBinary(x.high64, buf);
}

inline void readBinary(StackTrace::FramePointers & x, ReadBuffer & buf) { readPODBinary(x, buf); }

template <std::endian endian, typename T>
inline void readBinaryEndian(T & x, ReadBuffer & buf)
{
readPODBinary(x, buf);
readBinary(x, buf);
transformEndianness<endian>(x);
}

@@ -1,6 +1,7 @@
#include <Poco/UTF8Encoding.h>
#include <IO/WriteBufferValidUTF8.h>
#include <base/types.h>
#include <base/simd.h>

#ifdef __SSE2__
#include <emmintrin.h>
@@ -84,16 +85,13 @@ void WriteBufferValidUTF8::nextImpl()
/// Fast skip of ASCII for aarch64.
static constexpr size_t SIMD_BYTES = 16;
const char * simd_end = p + (pos - p) / SIMD_BYTES * SIMD_BYTES;
/// Returns a 64 bit mask of nibbles (4 bits for each byte).
auto get_nibble_mask = [](uint8x16_t input) -> uint64_t
{ return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(input), 4)), 0); };
/// Other options include
/// vmaxvq_u8(input) < 0b10000000;
/// Used by SIMDJSON, has latency 3 for M1, 6 for everything else
/// SIMDJSON uses it for 64 byte masks, so it's a little different.
/// vmaxvq_u32(vandq_u32(input, vdupq_n_u32(0x80808080))) // u32 version has latency 3
/// shrn version has universally <=3 cycles, on servers 2 cycles.
while (p < simd_end && get_nibble_mask(vcgeq_u8(vld1q_u8(reinterpret_cast<const uint8_t *>(p)), vdupq_n_u8(0x80))) == 0)
while (p < simd_end && getNibbleMask(vcgeq_u8(vld1q_u8(reinterpret_cast<const uint8_t *>(p)), vdupq_n_u8(0x80))) == 0)
p += SIMD_BYTES;

if (!(p < pos))
@@ -23,30 +23,23 @@ void formatHex(IteratorSrc src, IteratorDst dst, size_t num_bytes)
std::array<char, 36> formatUUID(const UUID & uuid)
{
std::array<char, 36> dst;
const auto * src_ptr = reinterpret_cast<const UInt8 *>(&uuid);
auto * dst_ptr = dst.data();

#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
const std::reverse_iterator src_it(src_ptr + 16);
formatHex(src_it + 8, dst_ptr, 4);
dst[8] = '-';
formatHex(src_it + 12, dst_ptr + 9, 2);
dst[13] = '-';
formatHex(src_it + 14, dst_ptr + 14, 2);
dst[18] = '-';
formatHex(src_it, dst_ptr + 19, 2);
dst[23] = '-';
formatHex(src_it + 2, dst_ptr + 24, 6);
const auto * src_ptr = reinterpret_cast<const UInt8 *>(&uuid);
const std::reverse_iterator src(src_ptr + 16);
#else
formatHex(src_ptr, dst_ptr, 4);
dst[8] = '-';
formatHex(src_ptr + 4, dst_ptr + 9, 2);
dst[13] = '-';
formatHex(src_ptr + 6, dst_ptr + 14, 2);
dst[18] = '-';
formatHex(src_ptr + 8, dst_ptr + 19, 2);
dst[23] = '-';
formatHex(src_ptr + 10, dst_ptr + 24, 6);
const auto * src = reinterpret_cast<const UInt8 *>(&uuid);
#endif
formatHex(src + 8, dst_ptr, 4);
dst[8] = '-';
formatHex(src + 12, dst_ptr + 9, 2);
dst[13] = '-';
formatHex(src + 14, dst_ptr + 14, 2);
dst[18] = '-';
formatHex(src, dst_ptr + 19, 2);
dst[23] = '-';
formatHex(src + 2, dst_ptr + 24, 6);

return dst;
}

@@ -88,6 +88,13 @@ inline void writePODBinary(const T & x, WriteBuffer & buf)
buf.write(reinterpret_cast<const char *>(&x), sizeof(x)); /// NOLINT
}

inline void writeUUIDBinary(const UUID & x, WriteBuffer & buf)
{
const auto & uuid = x.toUnderType();
writePODBinary(uuid.items[0], buf);
writePODBinary(uuid.items[1], buf);
}

template <typename T>
inline void writeIntBinary(const T & x, WriteBuffer & buf)
{
@@ -882,10 +889,20 @@ inline void writeBinary(const Decimal128 & x, WriteBuffer & buf) { writePODBinar
inline void writeBinary(const Decimal256 & x, WriteBuffer & buf) { writePODBinary(x.value, buf); }
inline void writeBinary(const LocalDate & x, WriteBuffer & buf) { writePODBinary(x, buf); }
inline void writeBinary(const LocalDateTime & x, WriteBuffer & buf) { writePODBinary(x, buf); }
inline void writeBinary(const UUID & x, WriteBuffer & buf) { writePODBinary(x, buf); }
inline void writeBinary(const IPv4 & x, WriteBuffer & buf) { writePODBinary(x, buf); }
inline void writeBinary(const IPv6 & x, WriteBuffer & buf) { writePODBinary(x, buf); }

inline void writeBinary(const UUID & x, WriteBuffer & buf)
{
writeUUIDBinary(x, buf);
}

inline void writeBinary(const CityHash_v1_0_2::uint128 & x, WriteBuffer & buf)
{
writePODBinary(x.low64, buf);
writePODBinary(x.high64, buf);
}

inline void writeBinary(const StackTrace::FramePointers & x, WriteBuffer & buf) { writePODBinary(x, buf); }

/// Methods for outputting the value in text form for a tab-separated format.
@@ -1208,7 +1225,7 @@ template <std::endian endian, typename T>
inline void writeBinaryEndian(T x, WriteBuffer & buf)
{
transformEndianness<endian>(x);
writePODBinary(x, buf);
writeBinary(x, buf);
}

template <typename T>

@@ -471,6 +471,21 @@ std::unique_ptr<SourceFromChunks> QueryCache::Reader::getSourceExtremes()
return std::move(source_from_chunks_extremes);
}

QueryCache::QueryCache(size_t max_size_in_bytes, size_t max_entries, size_t max_entry_size_in_bytes_, size_t max_entry_size_in_rows_)
: cache(std::make_unique<TTLCachePolicy<Key, Entry, KeyHasher, QueryCacheEntryWeight, IsStale>>(std::make_unique<PerUserTTLCachePolicyUserQuota>()))
{
updateConfiguration(max_size_in_bytes, max_entries, max_entry_size_in_bytes_, max_entry_size_in_rows_);
}

void QueryCache::updateConfiguration(size_t max_size_in_bytes, size_t max_entries, size_t max_entry_size_in_bytes_, size_t max_entry_size_in_rows_)
{
std::lock_guard lock(mutex);
cache.setMaxSize(max_size_in_bytes);
cache.setMaxCount(max_entries);
max_entry_size_in_bytes = max_entry_size_in_bytes_;
max_entry_size_in_rows = max_entry_size_in_rows_;
}

QueryCache::Reader QueryCache::createReader(const Key & key)
{
std::lock_guard lock(mutex);
@@ -488,9 +503,9 @@ QueryCache::Writer QueryCache::createWriter(const Key & key, std::chrono::millis
return Writer(cache, key, max_entry_size_in_bytes, max_entry_size_in_rows, min_query_runtime, squash_partial_results, max_block_size);
}

void QueryCache::reset()
void QueryCache::clear()
{
cache.reset();
cache.clear();
std::lock_guard lock(mutex);
times_executed.clear();
}
@@ -521,19 +536,4 @@ std::vector<QueryCache::Cache::KeyMapped> QueryCache::dump() const
return cache.dump();
}

QueryCache::QueryCache(size_t max_size_in_bytes, size_t max_entries, size_t max_entry_size_in_bytes_, size_t max_entry_size_in_rows_)
: cache(std::make_unique<TTLCachePolicy<Key, Entry, KeyHasher, QueryCacheEntryWeight, IsStale>>(std::make_unique<PerUserTTLCachePolicyUserQuota>()))
{
updateConfiguration(max_size_in_bytes, max_entries, max_entry_size_in_bytes_, max_entry_size_in_rows_);
}

void QueryCache::updateConfiguration(size_t max_size_in_bytes, size_t max_entries, size_t max_entry_size_in_bytes_, size_t max_entry_size_in_rows_)
{
std::lock_guard lock(mutex);
cache.setMaxSize(max_size_in_bytes);
cache.setMaxCount(max_entries);
max_entry_size_in_bytes = max_entry_size_in_bytes_;
max_entry_size_in_rows = max_entry_size_in_rows_;
}

}

@@ -180,7 +180,7 @@ public:
Reader createReader(const Key & key);
Writer createWriter(const Key & key, std::chrono::milliseconds min_query_runtime, bool squash_partial_results, size_t max_block_size, size_t max_query_cache_size_in_bytes_quota, size_t max_query_cache_entries_quota);

void reset();
void clear();

size_t weight() const;
size_t count() const;
@@ -14,6 +14,7 @@

#include <Client/IConnections.h>
#include <Common/logger_useful.h>
#include <Common/FailPoint.h>
#include <Processors/QueryPlan/QueryPlan.h>
#include <Processors/QueryPlan/ReadFromRemote.h>
#include <Processors/QueryPlan/ExpressionStep.h>
@@ -35,6 +36,11 @@ namespace ErrorCodes
extern const int ALL_REPLICAS_ARE_STALE;
}

namespace FailPoints
{
extern const char use_delayed_remote_source[];
}

namespace ClusterProxy
{

@@ -134,6 +140,12 @@ void SelectStreamFactory::createForShard(

const auto & settings = context->getSettingsRef();

fiu_do_on(FailPoints::use_delayed_remote_source,
{
emplace_remote_stream(/*lazy=*/true, /*local_delay=*/999999);
return;
});

if (settings.prefer_localhost_replica && shard_info.isLocal())
{
StoragePtr main_table_storage;
@@ -3,7 +3,6 @@
#include <condition_variable>
#include <memory>
#include <optional>
#include <Functions/FunctionsLogical.h>
#include <Interpreters/Context.h>
#include <Interpreters/ExpressionActions.h>
#include <Interpreters/HashJoin.h>
@@ -33,7 +32,13 @@ class ConcurrentHashJoin : public IJoin
{

public:
explicit ConcurrentHashJoin(ContextPtr context_, std::shared_ptr<TableJoin> table_join_, size_t slots_, const Block & right_sample_block, bool any_take_last_row_ = false);
explicit ConcurrentHashJoin(
ContextPtr context_,
std::shared_ptr<TableJoin> table_join_,
size_t slots_,
const Block & right_sample_block,
bool any_take_last_row_ = false);

~ConcurrentHashJoin() override = default;

std::string getName() const override { return "ConcurrentHashJoin"; }
@@ -67,7 +72,6 @@ private:

IColumn::Selector selectDispatchBlock(const Strings & key_columns_names, const Block & from_block);
Blocks dispatchBlock(const Strings & key_columns_names, const Block & from_block);

};

}

@@ -548,7 +548,7 @@ struct ContextSharedPart : boost::noncopyable
*/
#if USE_EMBEDDED_COMPILER
if (auto * cache = CompiledExpressionCacheFactory::instance().tryGetCache())
cache->reset();
cache->clear();
#endif

/// Preemptive destruction is important, because these objects may have a refcount to ContextShared (cyclic reference).
@@ -2278,6 +2278,16 @@ void Context::setUncompressedCache(const String & uncompressed_cache_policy, siz
shared->uncompressed_cache = std::make_shared<UncompressedCache>(uncompressed_cache_policy, max_size_in_bytes);
}

void Context::updateUncompressedCacheConfiguration(const Poco::Util::AbstractConfiguration & config)
{
auto lock = getLock();

if (!shared->uncompressed_cache)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Uncompressed cache was not created yet.");

size_t max_size_in_bytes = config.getUInt64("uncompressed_cache_size", DEFAULT_UNCOMPRESSED_CACHE_MAX_SIZE);
shared->uncompressed_cache->setMaxSize(max_size_in_bytes);
}

UncompressedCachePtr Context::getUncompressedCache() const
{
@@ -2285,14 +2295,13 @@ UncompressedCachePtr Context::getUncompressedCache() const
return shared->uncompressed_cache;
}


void Context::clearUncompressedCache() const
{
auto lock = getLock();
if (shared->uncompressed_cache)
shared->uncompressed_cache->reset();
}

if (shared->uncompressed_cache)
shared->uncompressed_cache->clear();
}

void Context::setMarkCache(const String & mark_cache_policy, size_t cache_size_in_bytes)
{
@@ -2304,6 +2313,17 @@ void Context::setMarkCache(const String & mark_cache_policy, size_t cache_size_i
shared->mark_cache = std::make_shared<MarkCache>(mark_cache_policy, cache_size_in_bytes);
}

void Context::updateMarkCacheConfiguration(const Poco::Util::AbstractConfiguration & config)
{
auto lock = getLock();

if (!shared->mark_cache)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Mark cache was not created yet.");

size_t max_size_in_bytes = config.getUInt64("mark_cache_size", DEFAULT_MARK_CACHE_MAX_SIZE);
shared->mark_cache->setMaxSize(max_size_in_bytes);
}

MarkCachePtr Context::getMarkCache() const
{
auto lock = getLock();
@@ -2313,8 +2333,9 @@ MarkCachePtr Context::getMarkCache() const
void Context::clearMarkCache() const
{
auto lock = getLock();

if (shared->mark_cache)
shared->mark_cache->reset();
shared->mark_cache->clear();
}

ThreadPool & Context::getLoadMarksThreadpool() const
@@ -2342,20 +2363,30 @@ void Context::setIndexUncompressedCache(size_t max_size_in_bytes)
shared->index_uncompressed_cache = std::make_shared<UncompressedCache>(max_size_in_bytes);
}

void Context::updateIndexUncompressedCacheConfiguration(const Poco::Util::AbstractConfiguration & config)
{
auto lock = getLock();

if (!shared->index_uncompressed_cache)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Index uncompressed cache was not created yet.");

size_t max_size_in_bytes = config.getUInt64("index_uncompressed_cache_size", DEFAULT_INDEX_UNCOMPRESSED_CACHE_MAX_SIZE);
shared->index_uncompressed_cache->setMaxSize(max_size_in_bytes);
}

UncompressedCachePtr Context::getIndexUncompressedCache() const
{
auto lock = getLock();
return shared->index_uncompressed_cache;
}


void Context::clearIndexUncompressedCache() const
{
auto lock = getLock();
if (shared->index_uncompressed_cache)
shared->index_uncompressed_cache->reset();
}

if (shared->index_uncompressed_cache)
shared->index_uncompressed_cache->clear();
}

void Context::setIndexMarkCache(size_t cache_size_in_bytes)
{
@@ -2367,6 +2398,17 @@ void Context::setIndexMarkCache(size_t cache_size_in_bytes)
shared->index_mark_cache = std::make_shared<MarkCache>(cache_size_in_bytes);
}

void Context::updateIndexMarkCacheConfiguration(const Poco::Util::AbstractConfiguration & config)
{
auto lock = getLock();

if (!shared->index_mark_cache)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Index mark cache was not created yet.");

size_t max_size_in_bytes = config.getUInt64("index_mark_cache_size", DEFAULT_INDEX_MARK_CACHE_MAX_SIZE);
shared->index_mark_cache->setMaxSize(max_size_in_bytes);
}

MarkCachePtr Context::getIndexMarkCache() const
{
auto lock = getLock();
@@ -2376,8 +2418,9 @@ MarkCachePtr Context::getIndexMarkCache() const
void Context::clearIndexMarkCache() const
{
auto lock = getLock();

if (shared->index_mark_cache)
shared->index_mark_cache->reset();
shared->index_mark_cache->clear();
}

void Context::setMMappedFileCache(size_t cache_size_in_num_entries)
@@ -2390,6 +2433,17 @@ void Context::setMMappedFileCache(size_t cache_size_in_num_entries)
shared->mmap_cache = std::make_shared<MMappedFileCache>(cache_size_in_num_entries);
}

void Context::updateMMappedFileCacheConfiguration(const Poco::Util::AbstractConfiguration & config)
{
auto lock = getLock();

if (!shared->mmap_cache)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Mapped file cache was not created yet.");

size_t max_size_in_bytes = config.getUInt64("mmap_cache_size", DEFAULT_MMAP_CACHE_MAX_SIZE);
shared->mmap_cache->setMaxSize(max_size_in_bytes);
}

MMappedFileCachePtr Context::getMMappedFileCache() const
{
auto lock = getLock();
@@ -2399,8 +2453,9 @@ MMappedFileCachePtr Context::getMMappedFileCache() const
void Context::clearMMappedFileCache() const
{
auto lock = getLock();

if (shared->mmap_cache)
shared->mmap_cache->reset();
shared->mmap_cache->clear();
}

void Context::setQueryCache(size_t max_size_in_bytes, size_t max_entries, size_t max_entry_size_in_bytes, size_t max_entry_size_in_rows)
@@ -2416,14 +2471,15 @@ void Context::setQueryCache(size_t max_size_in_bytes, size_t max_entries, size_t
void Context::updateQueryCacheConfiguration(const Poco::Util::AbstractConfiguration & config)
{
auto lock = getLock();
if (shared->query_cache)
{
size_t max_size_in_bytes = config.getUInt64("query_cache.max_size_in_bytes", DEFAULT_QUERY_CACHE_MAX_SIZE);
size_t max_entries = config.getUInt64("query_cache.max_entries", DEFAULT_QUERY_CACHE_MAX_ENTRIES);
size_t max_entry_size_in_bytes = config.getUInt64("query_cache.max_entry_size_in_bytes", DEFAULT_QUERY_CACHE_MAX_ENTRY_SIZE_IN_BYTES);
size_t max_entry_size_in_rows = config.getUInt64("query_cache.max_entry_rows_in_rows", DEFAULT_QUERY_CACHE_MAX_ENTRY_SIZE_IN_ROWS);
shared->query_cache->updateConfiguration(max_size_in_bytes, max_entries, max_entry_size_in_bytes, max_entry_size_in_rows);
}

if (!shared->query_cache)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Query cache was not created yet.");

size_t max_size_in_bytes = config.getUInt64("query_cache.max_size_in_bytes", DEFAULT_QUERY_CACHE_MAX_SIZE);
size_t max_entries = config.getUInt64("query_cache.max_entries", DEFAULT_QUERY_CACHE_MAX_ENTRIES);
size_t max_entry_size_in_bytes = config.getUInt64("query_cache.max_entry_size_in_bytes", DEFAULT_QUERY_CACHE_MAX_ENTRY_SIZE_IN_BYTES);
size_t max_entry_size_in_rows = config.getUInt64("query_cache.max_entry_rows_in_rows", DEFAULT_QUERY_CACHE_MAX_ENTRY_SIZE_IN_ROWS);
shared->query_cache->updateConfiguration(max_size_in_bytes, max_entries, max_entry_size_in_bytes, max_entry_size_in_rows);
}

QueryCachePtr Context::getQueryCache() const
@@ -2435,30 +2491,36 @@ QueryCachePtr Context::getQueryCache() const
void Context::clearQueryCache() const
{
auto lock = getLock();

if (shared->query_cache)
shared->query_cache->reset();
shared->query_cache->clear();
}

void Context::clearCaches() const
{
auto lock = getLock();

if (shared->uncompressed_cache)
shared->uncompressed_cache->reset();
if (!shared->uncompressed_cache)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Uncompressed cache was not created yet.");
shared->uncompressed_cache->clear();

if (shared->mark_cache)
shared->mark_cache->reset();
if (!shared->mark_cache)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Mark cache was not created yet.");
shared->mark_cache->clear();

if (shared->index_uncompressed_cache)
shared->index_uncompressed_cache->reset();
if (!shared->index_uncompressed_cache)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Index uncompressed cache was not created yet.");
shared->index_uncompressed_cache->clear();

if (shared->index_mark_cache)
shared->index_mark_cache->reset();
if (!shared->index_mark_cache)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Index mark cache was not created yet.");
shared->index_mark_cache->clear();

if (shared->mmap_cache)
shared->mmap_cache->reset();
if (!shared->mmap_cache)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Mmapped file cache was not created yet.");
shared->mmap_cache->clear();

/// Intentionally not dropping the query cache which is transactionally inconsistent by design.
/// Intentionally not clearing the query cache which is transactionally inconsistent by design.
}

ThreadPool & Context::getPrefetchThreadpool() const
@@ -922,33 +922,32 @@ public:

/// --- Caches ------------------------------------------------------------------------------------------

/// Create a cache of uncompressed blocks of specified size. This can be done only once.
void setUncompressedCache(const String & uncompressed_cache_policy, size_t max_size_in_bytes);
void updateUncompressedCacheConfiguration(const Poco::Util::AbstractConfiguration & config);
std::shared_ptr<UncompressedCache> getUncompressedCache() const;
void clearUncompressedCache() const;

/// Create a cache of marks of specified size. This can be done only once.
void setMarkCache(const String & mark_cache_policy, size_t cache_size_in_bytes);
void updateMarkCacheConfiguration(const Poco::Util::AbstractConfiguration & config);
std::shared_ptr<MarkCache> getMarkCache() const;
void clearMarkCache() const;
ThreadPool & getLoadMarksThreadpool() const;

/// Create a cache of index uncompressed blocks of specified size. This can be done only once.
void setIndexUncompressedCache(size_t max_size_in_bytes);
void updateIndexUncompressedCacheConfiguration(const Poco::Util::AbstractConfiguration & config);
std::shared_ptr<UncompressedCache> getIndexUncompressedCache() const;
void clearIndexUncompressedCache() const;

/// Create a cache of index marks of specified size. This can be done only once.
void setIndexMarkCache(size_t cache_size_in_bytes);
void updateIndexMarkCacheConfiguration(const Poco::Util::AbstractConfiguration & config);
std::shared_ptr<MarkCache> getIndexMarkCache() const;
void clearIndexMarkCache() const;

/// Create a cache of mapped files to avoid frequent open/map/unmap/close and to reuse from several threads.
void setMMappedFileCache(size_t cache_size_in_num_entries);
void updateMMappedFileCacheConfiguration(const Poco::Util::AbstractConfiguration & config);
std::shared_ptr<MMappedFileCache> getMMappedFileCache() const;
void clearMMappedFileCache() const;

/// Create a cache of query results for statements which run repeatedly.
void setQueryCache(size_t max_size_in_bytes, size_t max_entries, size_t max_entry_size_in_bytes, size_t max_entry_size_in_rows);
void updateQueryCacheConfiguration(const Poco::Util::AbstractConfiguration & config);
std::shared_ptr<QueryCache> getQueryCache() const;