Merge branch 'master' into working

2024-11-26 01:22:04 +00:00 · 2023-08-09 11:35:14 +02:00 · 2023-08-09 11:35:14 +02:00 · 47cccee657
commit 47cccee657
parent 11b5a3dd22 3ffffb0b5e
237 changed files with 7110 additions and 827 deletions
--- a/.github/workflows/backport_branches.yml
+++ b/.github/workflows/backport_branches.yml
@ -3,6 +3,9 @@ name: BackportPR
 env:
  # Force the stdout and stderr streams to be unbuffered
  PYTHONUNBUFFERED: 1
+  # Export system tables to ClickHouse Cloud
+  CLICKHOUSE_CI_LOGS_HOST: ${{ secrets.CLICKHOUSE_CI_LOGS_HOST }}
+  CLICKHOUSE_CI_LOGS_PASSWORD: ${{ secrets.CLICKHOUSE_CI_LOGS_PASSWORD }}

 on: # yamllint disable-line rule:truthy
  push:
--- a/.github/workflows/master.yml
+++ b/.github/workflows/master.yml
@ -3,6 +3,9 @@ name: MasterCI
 env:
  # Force the stdout and stderr streams to be unbuffered
  PYTHONUNBUFFERED: 1
+  # Export system tables to ClickHouse Cloud
+  CLICKHOUSE_CI_LOGS_HOST: ${{ secrets.CLICKHOUSE_CI_LOGS_HOST }}
+  CLICKHOUSE_CI_LOGS_PASSWORD: ${{ secrets.CLICKHOUSE_CI_LOGS_PASSWORD }}

 on: # yamllint disable-line rule:truthy
  push:
--- a/.github/workflows/pull_request.yml
+++ b/.github/workflows/pull_request.yml
@ -3,6 +3,9 @@ name: PullRequestCI
 env:
  # Force the stdout and stderr streams to be unbuffered
  PYTHONUNBUFFERED: 1
+  # Export system tables to ClickHouse Cloud
+  CLICKHOUSE_CI_LOGS_HOST: ${{ secrets.CLICKHOUSE_CI_LOGS_HOST }}
+  CLICKHOUSE_CI_LOGS_PASSWORD: ${{ secrets.CLICKHOUSE_CI_LOGS_PASSWORD }}

 on:  # yamllint disable-line rule:truthy
  pull_request:
--- a/.github/workflows/release_branches.yml
+++ b/.github/workflows/release_branches.yml
@ -3,6 +3,9 @@ name: ReleaseBranchCI
 env:
  # Force the stdout and stderr streams to be unbuffered
  PYTHONUNBUFFERED: 1
+  # Export system tables to ClickHouse Cloud
+  CLICKHOUSE_CI_LOGS_HOST: ${{ secrets.CLICKHOUSE_CI_LOGS_HOST }}
+  CLICKHOUSE_CI_LOGS_PASSWORD: ${{ secrets.CLICKHOUSE_CI_LOGS_PASSWORD }}

 on: # yamllint disable-line rule:truthy
  push:
--- a/.gitmodules
+++ b/.gitmodules
@ -331,6 +331,10 @@
 [submodule "contrib/liburing"]
 	path = contrib/liburing
 	url = https://github.com/axboe/liburing
+[submodule "contrib/libarchive"]
+	path = contrib/libarchive
+	url = https://github.com/libarchive/libarchive.git
+	ignore = dirty
 [submodule "contrib/libfiu"]
 	path = contrib/libfiu
 	url = https://github.com/ClickHouse/libfiu.git
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -52,7 +52,6 @@
 * Add new setting `disable_url_encoding` that allows to disable decoding/encoding path in uri in URL engine. [#52337](https://github.com/ClickHouse/ClickHouse/pull/52337) ([Kruglov Pavel](https://github.com/Avogar)).

 #### Performance Improvement
-* Writing parquet files is 10x faster, it's multi-threaded now. Almost the same speed as reading. [#49367](https://github.com/ClickHouse/ClickHouse/pull/49367) ([Michael Kolupaev](https://github.com/al13n321)).
 * Enable automatic selection of the sparse serialization format by default. It improves performance. The format is supported since version 22.1. After this change, downgrading to versions older than 22.1 might not be possible. You can turn off the usage of the sparse serialization format by providing the `ratio_of_defaults_for_sparse_serialization = 1` setting for your MergeTree tables. [#49631](https://github.com/ClickHouse/ClickHouse/pull/49631) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
 * Enable `move_all_conditions_to_prewhere` and `enable_multiple_prewhere_read_steps` settings by default. [#46365](https://github.com/ClickHouse/ClickHouse/pull/46365) ([Alexander Gololobov](https://github.com/davenger)).
 * Improves performance of some queries by tuning allocator. [#46416](https://github.com/ClickHouse/ClickHouse/pull/46416) ([Azat Khuzhin](https://github.com/azat)).
@ -114,6 +113,7 @@
 * Now interserver port will be closed only after tables are shut down. [#52498](https://github.com/ClickHouse/ClickHouse/pull/52498) ([alesapin](https://github.com/alesapin)).

 #### Experimental Feature
+* Writing parquet files is 10x faster, it's multi-threaded now. Almost the same speed as reading. [#49367](https://github.com/ClickHouse/ClickHouse/pull/49367) ([Michael Kolupaev](https://github.com/al13n321)). This is controlled by the setting `output_format_parquet_use_custom_encoder` which is disabled by default, because the feature is non-ideal.
 * Added support for [PRQL](https://prql-lang.org/) as a query language. [#50686](https://github.com/ClickHouse/ClickHouse/pull/50686) ([János Benjamin Antal](https://github.com/antaljanosbenjamin)).
 * Allow to add disk name for custom disks. Previously custom disks would use an internal generated disk name. Now it will be possible with `disk = disk_<name>(...)` (e.g. disk will have name `name`) . [#51552](https://github.com/ClickHouse/ClickHouse/pull/51552) ([Kseniia Sumarokova](https://github.com/kssenii)). This syntax can be changed in this release.
 * (experimental MaterializedMySQL) Fixed crash when `mysqlxx::Pool::Entry` is used after it was disconnected. [#52063](https://github.com/ClickHouse/ClickHouse/pull/52063) ([Val Doroshchuk](https://github.com/valbok)).
--- a/contrib/CMakeLists.txt
+++ b/contrib/CMakeLists.txt
@ -92,6 +92,7 @@ add_contrib (google-protobuf-cmake google-protobuf)
 add_contrib (openldap-cmake openldap)
 add_contrib (grpc-cmake grpc)
 add_contrib (msgpack-c-cmake msgpack-c)
+add_contrib (libarchive-cmake libarchive)

 add_contrib (corrosion-cmake corrosion)

--- a/contrib/libarchive
+++ b/contrib/libarchive
@ -0,0 +1 @@
+Subproject commit ee45796171324519f0c0bfd012018dd099296336
--- a/contrib/libarchive-cmake/CMakeLists.txt
+++ b/contrib/libarchive-cmake/CMakeLists.txt
@ -0,0 +1,172 @@
+set (LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/libarchive")
+
+set(SRCS 
+    "${LIBRARY_DIR}/libarchive/archive_acl.c"
+    "${LIBRARY_DIR}/libarchive/archive_blake2sp_ref.c"
+    "${LIBRARY_DIR}/libarchive/archive_blake2s_ref.c"
+    "${LIBRARY_DIR}/libarchive/archive_check_magic.c"
+    "${LIBRARY_DIR}/libarchive/archive_cmdline.c"
+    "${LIBRARY_DIR}/libarchive/archive_cryptor.c"
+    "${LIBRARY_DIR}/libarchive/archive_digest.c"
+    "${LIBRARY_DIR}/libarchive/archive_disk_acl_darwin.c"
+    "${LIBRARY_DIR}/libarchive/archive_disk_acl_freebsd.c"
+    "${LIBRARY_DIR}/libarchive/archive_disk_acl_linux.c"
+    "${LIBRARY_DIR}/libarchive/archive_disk_acl_sunos.c"
+    "${LIBRARY_DIR}/libarchive/archive_entry.c"
+    "${LIBRARY_DIR}/libarchive/archive_entry_copy_bhfi.c"
+    "${LIBRARY_DIR}/libarchive/archive_entry_copy_stat.c"
+    "${LIBRARY_DIR}/libarchive/archive_entry_link_resolver.c"
+    "${LIBRARY_DIR}/libarchive/archive_entry_sparse.c"
+    "${LIBRARY_DIR}/libarchive/archive_entry_stat.c"
+    "${LIBRARY_DIR}/libarchive/archive_entry_strmode.c"
+    "${LIBRARY_DIR}/libarchive/archive_entry_xattr.c"
+    "${LIBRARY_DIR}/libarchive/archive_getdate.c"
+    "${LIBRARY_DIR}/libarchive/archive_hmac.c"
+    "${LIBRARY_DIR}/libarchive/archive_match.c"
+    "${LIBRARY_DIR}/libarchive/archive_options.c"
+    "${LIBRARY_DIR}/libarchive/archive_pack_dev.c"
+    "${LIBRARY_DIR}/libarchive/archive_pathmatch.c"
+    "${LIBRARY_DIR}/libarchive/archive_ppmd7.c"
+    "${LIBRARY_DIR}/libarchive/archive_ppmd8.c"
+    "${LIBRARY_DIR}/libarchive/archive_random.c"
+    "${LIBRARY_DIR}/libarchive/archive_rb.c"
+    "${LIBRARY_DIR}/libarchive/archive_read_add_passphrase.c"
+    "${LIBRARY_DIR}/libarchive/archive_read_append_filter.c"
+    "${LIBRARY_DIR}/libarchive/archive_read.c"
+    "${LIBRARY_DIR}/libarchive/archive_read_data_into_fd.c"
+    "${LIBRARY_DIR}/libarchive/archive_read_disk_entry_from_file.c"
+    "${LIBRARY_DIR}/libarchive/archive_read_disk_posix.c"
+    "${LIBRARY_DIR}/libarchive/archive_read_disk_set_standard_lookup.c"
+    "${LIBRARY_DIR}/libarchive/archive_read_disk_windows.c"
+    "${LIBRARY_DIR}/libarchive/archive_read_extract2.c"
+    "${LIBRARY_DIR}/libarchive/archive_read_extract.c"
+    "${LIBRARY_DIR}/libarchive/archive_read_open_fd.c"
+    "${LIBRARY_DIR}/libarchive/archive_read_open_file.c"
+    "${LIBRARY_DIR}/libarchive/archive_read_open_filename.c"
+    "${LIBRARY_DIR}/libarchive/archive_read_open_memory.c"
+    "${LIBRARY_DIR}/libarchive/archive_read_set_format.c"
+    "${LIBRARY_DIR}/libarchive/archive_read_set_options.c"
+    "${LIBRARY_DIR}/libarchive/archive_read_support_filter_all.c"
+    "${LIBRARY_DIR}/libarchive/archive_read_support_filter_by_code.c"
+    "${LIBRARY_DIR}/libarchive/archive_read_support_filter_bzip2.c"
+    "${LIBRARY_DIR}/libarchive/archive_read_support_filter_compress.c"
+    "${LIBRARY_DIR}/libarchive/archive_read_support_filter_grzip.c"
+    "${LIBRARY_DIR}/libarchive/archive_read_support_filter_gzip.c"
+    "${LIBRARY_DIR}/libarchive/archive_read_support_filter_lrzip.c"
+    "${LIBRARY_DIR}/libarchive/archive_read_support_filter_lz4.c"
+    "${LIBRARY_DIR}/libarchive/archive_read_support_filter_lzop.c"
+    "${LIBRARY_DIR}/libarchive/archive_read_support_filter_none.c"
+    "${LIBRARY_DIR}/libarchive/archive_read_support_filter_program.c"
+    "${LIBRARY_DIR}/libarchive/archive_read_support_filter_rpm.c"
+    "${LIBRARY_DIR}/libarchive/archive_read_support_filter_uu.c"
+    "${LIBRARY_DIR}/libarchive/archive_read_support_filter_xz.c"
+    "${LIBRARY_DIR}/libarchive/archive_read_support_filter_zstd.c"
+    "${LIBRARY_DIR}/libarchive/archive_read_support_format_7zip.c"
+    "${LIBRARY_DIR}/libarchive/archive_read_support_format_all.c"
+    "${LIBRARY_DIR}/libarchive/archive_read_support_format_ar.c"
+    "${LIBRARY_DIR}/libarchive/archive_read_support_format_by_code.c"
+    "${LIBRARY_DIR}/libarchive/archive_read_support_format_cab.c"
+    "${LIBRARY_DIR}/libarchive/archive_read_support_format_cpio.c"
+    "${LIBRARY_DIR}/libarchive/archive_read_support_format_empty.c"
+    "${LIBRARY_DIR}/libarchive/archive_read_support_format_iso9660.c"
+    "${LIBRARY_DIR}/libarchive/archive_read_support_format_lha.c"
+    "${LIBRARY_DIR}/libarchive/archive_read_support_format_mtree.c"
+    "${LIBRARY_DIR}/libarchive/archive_read_support_format_rar5.c"
+    "${LIBRARY_DIR}/libarchive/archive_read_support_format_rar.c"
+    "${LIBRARY_DIR}/libarchive/archive_read_support_format_raw.c"
+    "${LIBRARY_DIR}/libarchive/archive_read_support_format_tar.c"
+    "${LIBRARY_DIR}/libarchive/archive_read_support_format_warc.c"
+    "${LIBRARY_DIR}/libarchive/archive_read_support_format_xar.c"
+    "${LIBRARY_DIR}/libarchive/archive_read_support_format_zip.c"
+    "${LIBRARY_DIR}/libarchive/archive_string.c"
+    "${LIBRARY_DIR}/libarchive/archive_string_sprintf.c"
+    "${LIBRARY_DIR}/libarchive/archive_util.c"
+    "${LIBRARY_DIR}/libarchive/archive_version_details.c"
+    "${LIBRARY_DIR}/libarchive/archive_virtual.c"
+    "${LIBRARY_DIR}/libarchive/archive_windows.c"
+    "${LIBRARY_DIR}/libarchive/archive_write_add_filter_b64encode.c"
+    "${LIBRARY_DIR}/libarchive/archive_write_add_filter_by_name.c"
+    "${LIBRARY_DIR}/libarchive/archive_write_add_filter_bzip2.c"
+    "${LIBRARY_DIR}/libarchive/archive_write_add_filter.c"
+    "${LIBRARY_DIR}/libarchive/archive_write_add_filter_compress.c"
+    "${LIBRARY_DIR}/libarchive/archive_write_add_filter_grzip.c"
+    "${LIBRARY_DIR}/libarchive/archive_write_add_filter_gzip.c"
+    "${LIBRARY_DIR}/libarchive/archive_write_add_filter_lrzip.c"
+    "${LIBRARY_DIR}/libarchive/archive_write_add_filter_lz4.c"
+    "${LIBRARY_DIR}/libarchive/archive_write_add_filter_lzop.c"
+    "${LIBRARY_DIR}/libarchive/archive_write_add_filter_none.c"
+    "${LIBRARY_DIR}/libarchive/archive_write_add_filter_program.c"
+    "${LIBRARY_DIR}/libarchive/archive_write_add_filter_uuencode.c"
+    "${LIBRARY_DIR}/libarchive/archive_write_add_filter_xz.c"
+    "${LIBRARY_DIR}/libarchive/archive_write_add_filter_zstd.c"
+    "${LIBRARY_DIR}/libarchive/archive_write.c"
+    "${LIBRARY_DIR}/libarchive/archive_write_disk_posix.c"
+    "${LIBRARY_DIR}/libarchive/archive_write_disk_set_standard_lookup.c"
+    "${LIBRARY_DIR}/libarchive/archive_write_disk_windows.c"
+    "${LIBRARY_DIR}/libarchive/archive_write_open_fd.c"
+    "${LIBRARY_DIR}/libarchive/archive_write_open_file.c"
+    "${LIBRARY_DIR}/libarchive/archive_write_open_filename.c"
+    "${LIBRARY_DIR}/libarchive/archive_write_open_memory.c"
+    "${LIBRARY_DIR}/libarchive/archive_write_set_format_7zip.c"
+    "${LIBRARY_DIR}/libarchive/archive_write_set_format_ar.c"
+    "${LIBRARY_DIR}/libarchive/archive_write_set_format_by_name.c"
+    "${LIBRARY_DIR}/libarchive/archive_write_set_format.c"
+    "${LIBRARY_DIR}/libarchive/archive_write_set_format_cpio_binary.c"
+    "${LIBRARY_DIR}/libarchive/archive_write_set_format_cpio.c"
+    "${LIBRARY_DIR}/libarchive/archive_write_set_format_cpio_newc.c"
+    "${LIBRARY_DIR}/libarchive/archive_write_set_format_cpio_odc.c"
+    "${LIBRARY_DIR}/libarchive/archive_write_set_format_filter_by_ext.c"
+    "${LIBRARY_DIR}/libarchive/archive_write_set_format_gnutar.c"
+    "${LIBRARY_DIR}/libarchive/archive_write_set_format_iso9660.c"
+    "${LIBRARY_DIR}/libarchive/archive_write_set_format_mtree.c"
+    "${LIBRARY_DIR}/libarchive/archive_write_set_format_pax.c"
+    "${LIBRARY_DIR}/libarchive/archive_write_set_format_raw.c"
+    "${LIBRARY_DIR}/libarchive/archive_write_set_format_shar.c"
+    "${LIBRARY_DIR}/libarchive/archive_write_set_format_ustar.c"
+    "${LIBRARY_DIR}/libarchive/archive_write_set_format_v7tar.c"
+    "${LIBRARY_DIR}/libarchive/archive_write_set_format_warc.c"
+    "${LIBRARY_DIR}/libarchive/archive_write_set_format_xar.c"
+    "${LIBRARY_DIR}/libarchive/archive_write_set_format_zip.c"
+    "${LIBRARY_DIR}/libarchive/archive_write_set_options.c"
+    "${LIBRARY_DIR}/libarchive/archive_write_set_passphrase.c"
+    "${LIBRARY_DIR}/libarchive/filter_fork_posix.c"
+    "${LIBRARY_DIR}/libarchive/filter_fork_windows.c"
+    "${LIBRARY_DIR}/libarchive/xxhash.c"
+)
+
+add_library(_libarchive ${SRCS})
+target_include_directories(_libarchive PUBLIC 
+    ${CMAKE_CURRENT_SOURCE_DIR}
+    "${LIBRARY_DIR}/libarchive"
+)
+
+target_compile_definitions(_libarchive PUBLIC
+    HAVE_CONFIG_H
+)
+
+target_compile_options(_libarchive PRIVATE "-Wno-reserved-macro-identifier")
+
+if (TARGET ch_contrib::xz)
+    target_compile_definitions(_libarchive PUBLIC HAVE_LZMA_H=1)
+    target_link_libraries(_libarchive PRIVATE ch_contrib::xz)
+endif()
+
+if (TARGET ch_contrib::zlib)
+    target_compile_definitions(_libarchive PUBLIC HAVE_ZLIB_H=1)
+    target_link_libraries(_libarchive PRIVATE ch_contrib::zlib)
+endif()
+
+if (OS_LINUX)
+    target_compile_definitions(
+        _libarchive PUBLIC
+            MAJOR_IN_SYSMACROS=1
+            HAVE_LINUX_FS_H=1
+            HAVE_STRUCT_STAT_ST_MTIM_TV_NSEC=1
+            HAVE_LINUX_TYPES_H=1
+            HAVE_SYS_STATFS_H=1
+            HAVE_FUTIMESAT=1
+            HAVE_ICONV=1
+    )
+endif()
+
+add_library(ch_contrib::libarchive ALIAS _libarchive)
--- a/contrib/libarchive-cmake/config.h
+++ b/contrib/libarchive-cmake/config.h
--- a/docker/keeper/Dockerfile
+++ b/docker/keeper/Dockerfile
@ -32,7 +32,7 @@ RUN arch=${TARGETARCH:-amd64} \
    esac

 ARG REPOSITORY="https://s3.amazonaws.com/clickhouse-builds/22.4/31c367d3cd3aefd316778601ff6565119fe36682/package_release"
-ARG VERSION="23.7.3.14"
+ARG VERSION="23.7.4.5"
 ARG PACKAGES="clickhouse-keeper"

 # user/group precreated explicitly with fixed uid/gid on purpose.
--- a/docker/packager/binary/Dockerfile
+++ b/docker/packager/binary/Dockerfile
@ -101,6 +101,7 @@ RUN add-apt-repository ppa:ubuntu-toolchain-r/test --yes \
        python3-boto3 \
        yasm \
        zstd \
+        jq \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists

--- a/docker/packager/binary/build.sh
+++ b/docker/packager/binary/build.sh
@ -59,7 +59,7 @@ if [ "$BUILD_MUSL_KEEPER" == "1" ]
 then
    # build keeper with musl separately
    # and without rust bindings
-    cmake --debug-trycompile -DENABLE_RUST=OFF -DBUILD_STANDALONE_KEEPER=1 -DENABLE_CLICKHOUSE_KEEPER=1 -DCMAKE_VERBOSE_MAKEFILE=1 -DUSE_MUSL=1 -LA -DCMAKE_TOOLCHAIN_FILE=/build/cmake/linux/toolchain-x86_64-musl.cmake "-DCMAKE_BUILD_TYPE=$BUILD_TYPE" "-DSANITIZE=$SANITIZER" -DENABLE_CHECK_HEAVY_BUILDS=1 "${CMAKE_FLAGS[@]}" ..
+    cmake --debug-trycompile -DENABLE_RUST=OFF -DBUILD_STANDALONE_KEEPER=1 -DENABLE_CLICKHOUSE_KEEPER=1 -DCMAKE_VERBOSE_MAKEFILE=1 -DUSE_MUSL=1 -LA -DCMAKE_TOOLCHAIN_FILE=/build/cmake/linux/toolchain-x86_64-musl.cmake "-DCMAKE_BUILD_TYPE=$BUILD_TYPE" "-DSANITIZE=$SANITIZER" -DENABLE_CHECK_HEAVY_BUILDS=1 -DENABLE_BUILD_PROFILING=1 "${CMAKE_FLAGS[@]}" ..
    # shellcheck disable=SC2086 # No quotes because I want it to expand to nothing if empty.
    ninja $NINJA_FLAGS clickhouse-keeper

@ -74,10 +74,10 @@ then
    rm -f CMakeCache.txt

    # Build the rest of binaries
-    cmake --debug-trycompile -DBUILD_STANDALONE_KEEPER=0 -DCREATE_KEEPER_SYMLINK=0 -DCMAKE_VERBOSE_MAKEFILE=1 -LA "-DCMAKE_BUILD_TYPE=$BUILD_TYPE" "-DSANITIZE=$SANITIZER" -DENABLE_CHECK_HEAVY_BUILDS=1 "${CMAKE_FLAGS[@]}" ..
+    cmake --debug-trycompile -DBUILD_STANDALONE_KEEPER=0 -DCREATE_KEEPER_SYMLINK=0 -DCMAKE_VERBOSE_MAKEFILE=1 -LA "-DCMAKE_BUILD_TYPE=$BUILD_TYPE" "-DSANITIZE=$SANITIZER" -DENABLE_CHECK_HEAVY_BUILDS=1 -DENABLE_BUILD_PROFILING=1 "${CMAKE_FLAGS[@]}" ..
 else
    # Build everything
-    cmake --debug-trycompile -DCMAKE_VERBOSE_MAKEFILE=1 -LA "-DCMAKE_BUILD_TYPE=$BUILD_TYPE" "-DSANITIZE=$SANITIZER" -DENABLE_CHECK_HEAVY_BUILDS=1 "${CMAKE_FLAGS[@]}" ..
+    cmake --debug-trycompile -DCMAKE_VERBOSE_MAKEFILE=1 -LA "-DCMAKE_BUILD_TYPE=$BUILD_TYPE" "-DSANITIZE=$SANITIZER" -DENABLE_CHECK_HEAVY_BUILDS=1 -DENABLE_BUILD_PROFILING=1 "${CMAKE_FLAGS[@]}" ..
 fi

 # No quotes because I want it to expand to nothing if empty.
@ -181,4 +181,11 @@ then
    tar -cv -I pixz -f /output/ccache.log.txz "$CCACHE_LOGFILE"
 fi

+# Prepare profile info (time-trace)
+mkdir -p profile-tmp
+../utils/prepare-time-trace/prepare-time-trace.sh . profile-tmp
+find profile-tmp -type f -print0 | xargs -0 cat > /profile/profile.json
+
+wc -c /profile/profile.json
+
 ls -l /output
--- a/docker/packager/packager
+++ b/docker/packager/packager
@ -78,11 +78,14 @@ def run_docker_image_with_env(
    image_name: str,
    as_root: bool,
    output_dir: Path,
+    profile_dir: Path,
    env_variables: List[str],
    ch_root: Path,
    ccache_dir: Optional[Path],
 ):
    output_dir.mkdir(parents=True, exist_ok=True)
+    profile_dir.mkdir(parents=True, exist_ok=True)
+
    env_part = " -e ".join(env_variables)
    if env_part:
        env_part = " -e " + env_part
@ -103,7 +106,7 @@ def run_docker_image_with_env(

    cmd = (
        f"docker run --network=host --user={user} --rm {ccache_mount}"
-        f"--volume={output_dir}:/output --volume={ch_root}:/build {env_part} "
+        f"--volume={output_dir}:/output --volume={ch_root}:/build --volume={profile_dir}:/profile {env_part} "
        f"{interactive} {image_name}"
    )

@ -361,6 +364,7 @@ def parse_args() -> argparse.Namespace:
        help="ClickHouse git repository",
    )
    parser.add_argument("--output-dir", type=dir_name, required=True)
+    parser.add_argument("--profile-dir", type=dir_name, required=True)
    parser.add_argument("--debug-build", action="store_true")

    parser.add_argument(
@ -488,6 +492,7 @@ def main():
        image_with_version,
        args.as_root,
        args.output_dir,
+        args.profile_dir,
        env_prepared,
        ch_root,
        args.ccache_dir,
--- a/docker/server/Dockerfile.alpine
+++ b/docker/server/Dockerfile.alpine
@ -33,7 +33,7 @@ RUN arch=${TARGETARCH:-amd64} \
 # lts / testing / prestable / etc
 ARG REPO_CHANNEL="stable"
 ARG REPOSITORY="https://packages.clickhouse.com/tgz/${REPO_CHANNEL}"
-ARG VERSION="23.7.3.14"
+ARG VERSION="23.7.4.5"
 ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static"

 # user/group precreated explicitly with fixed uid/gid on purpose.
--- a/docker/server/Dockerfile.ubuntu
+++ b/docker/server/Dockerfile.ubuntu
@ -23,7 +23,7 @@ RUN sed -i "s|http://archive.ubuntu.com|${apt_archive}|g" /etc/apt/sources.list

 ARG REPO_CHANNEL="stable"
 ARG REPOSITORY="deb [signed-by=/usr/share/keyrings/clickhouse-keyring.gpg] https://packages.clickhouse.com/deb ${REPO_CHANNEL} main"
-ARG VERSION="23.7.3.14"
+ARG VERSION="23.7.4.5"
 ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static"

 # set non-empty deb_location_url url to create a docker image
--- a/docker/test/base/Dockerfile
+++ b/docker/test/base/Dockerfile
@ -19,13 +19,13 @@ RUN apt-get update \
 # and MEMORY_LIMIT_EXCEEDED exceptions in Functional tests (total memory limit in Functional tests is ~55.24 GiB).
 # TSAN will flush shadow memory when reaching this limit.
 # It may cause false-negatives, but it's better than OOM.
-RUN echo "TSAN_OPTIONS='verbosity=1000 halt_on_error=1 history_size=7 memory_limit_mb=46080 second_deadlock_stack=1'" >> /etc/environment
+RUN echo "TSAN_OPTIONS='verbosity=1000 halt_on_error=1 abort_on_error=1 history_size=7 memory_limit_mb=46080 second_deadlock_stack=1'" >> /etc/environment
 RUN echo "UBSAN_OPTIONS='print_stacktrace=1'" >> /etc/environment
 RUN echo "MSAN_OPTIONS='abort_on_error=1 poison_in_dtor=1'" >> /etc/environment
 RUN echo "LSAN_OPTIONS='suppressions=/usr/share/clickhouse-test/config/lsan_suppressions.txt'" >> /etc/environment
 # Sanitizer options for current shell (not current, but the one that will be spawned on "docker run")
 # (but w/o verbosity for TSAN, otherwise test.reference will not match)
-ENV TSAN_OPTIONS='halt_on_error=1 history_size=7 memory_limit_mb=46080 second_deadlock_stack=1'
+ENV TSAN_OPTIONS='halt_on_error=1 abort_on_error=1 history_size=7 memory_limit_mb=46080 second_deadlock_stack=1'
 ENV UBSAN_OPTIONS='print_stacktrace=1'
 ENV MSAN_OPTIONS='abort_on_error=1 poison_in_dtor=1'

--- a/docker/test/integration/runner/Dockerfile
+++ b/docker/test/integration/runner/Dockerfile
@ -130,7 +130,7 @@ COPY misc/ /misc/

 # Same options as in test/base/Dockerfile
 # (in case you need to override them in tests)
-ENV TSAN_OPTIONS='halt_on_error=1 history_size=7 memory_limit_mb=46080 second_deadlock_stack=1'
+ENV TSAN_OPTIONS='halt_on_error=1 abort_on_error=1 history_size=7 memory_limit_mb=46080 second_deadlock_stack=1'
 ENV UBSAN_OPTIONS='print_stacktrace=1'
 ENV MSAN_OPTIONS='abort_on_error=1 poison_in_dtor=1'

--- a/docker/test/performance-comparison/config/users.d/perf-comparison-tweaks-users.xml
+++ b/docker/test/performance-comparison/config/users.d/perf-comparison-tweaks-users.xml
@ -3,7 +3,7 @@
        <default>
            <allow_introspection_functions>1</allow_introspection_functions>
            <log_queries>1</log_queries>
-            <metrics_perf_events_enabled>1</metrics_perf_events_enabled>
+            <metrics_perf_events_enabled>0</metrics_perf_events_enabled>
            <!--
                If a test takes too long by mistake, the entire test task can
                time out and the author won't get a proper message. Put some cap
--- a/docker/test/performance-comparison/perf.py
+++ b/docker/test/performance-comparison/perf.py
@ -369,6 +369,7 @@ for query_index in queries_to_run:
                        "max_execution_time": args.prewarm_max_query_seconds,
                        "query_profiler_real_time_period_ns": 10000000,
                        "query_profiler_cpu_time_period_ns": 10000000,
+                        "metrics_perf_events_enabled": 1,
                        "memory_profiler_step": "4Mi",
                    },
                )
@ -503,6 +504,7 @@ for query_index in queries_to_run:
                    settings={
                        "query_profiler_real_time_period_ns": 10000000,
                        "query_profiler_cpu_time_period_ns": 10000000,
+                        "metrics_perf_events_enabled": 1,
                    },
                )
                print(
--- a/docker/test/stateless/Dockerfile
+++ b/docker/test/stateless/Dockerfile
@ -41,6 +41,8 @@ RUN apt-get update -y \
            zstd \
            file \
            pv \
+            zip \
+            p7zip-full \
    && apt-get clean

 RUN pip3 install numpy scipy pandas Jinja2
--- a/docs/changelogs/v23.7.4.5-stable.md
+++ b/docs/changelogs/v23.7.4.5-stable.md
@ -0,0 +1,17 @@
+---
+sidebar_position: 1
+sidebar_label: 2023
+---
+
+# 2023 Changelog
+
+### ClickHouse release v23.7.4.5-stable (bd2fcd44553) FIXME as compared to v23.7.3.14-stable (bd9a510550c)
+
+#### Bug Fix (user-visible misbehavior in an official stable release)
+
+* Disable the new parquet encoder [#53130](https://github.com/ClickHouse/ClickHouse/pull/53130) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
+
+#### NOT FOR CHANGELOG / INSIGNIFICANT
+
+* Revert changes in `ZstdDeflatingAppendableWriteBuffer` [#53111](https://github.com/ClickHouse/ClickHouse/pull/53111) ([Antonio Andelic](https://github.com/antonio2368)).
+
--- a/docs/en/development/build.md
+++ b/docs/en/development/build.md
@ -42,20 +42,20 @@ sudo apt-get install git cmake ccache python3 ninja-build nasm yasm gawk lsb-rel

 ### Install and Use the Clang compiler

-On Ubuntu/Debian you can use LLVM's automatic installation script, see [here](https://apt.llvm.org/).
+On Ubuntu/Debian, you can use LLVM's automatic installation script; see [here](https://apt.llvm.org/).

 ``` bash
 sudo bash -c "$(wget -O - https://apt.llvm.org/llvm.sh)"
 ```

-Note: in case of troubles, you can also use this:
+Note: in case of trouble, you can also use this:

 ```bash
 sudo apt-get install software-properties-common
 sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
 ```

-For other Linux distribution - check the availability of LLVM's [prebuild packages](https://releases.llvm.org/download.html).
+For other Linux distributions, check the availability of LLVM's [prebuilt packages](https://releases.llvm.org/download.html).

 As of April 2023, clang-16 or higher will work.
 GCC as a compiler is not supported.
@ -92,8 +92,12 @@ cmake -S . -B build
 cmake --build build  # or: `cd build; ninja`
 ```

+:::tip
+In case `cmake` isn't able to detect the number of available logical cores, the build will run on a single thread. To overcome this, you can tell `cmake` to use a specific number of threads with the `-j` flag, for example, `cmake --build build -j 16`. Alternatively, you can generate build files with a specific number of jobs in advance to avoid always setting the flag: `cmake -DPARALLEL_COMPILE_JOBS=16 -S . -B build`, where `16` is the desired number of threads.
+:::
+
 To create an executable, run `cmake --build build --target clickhouse` (or: `cd build; ninja clickhouse`).
-This will create executable `build/programs/clickhouse` which can be used with `client` or `server` arguments.
+This will create an executable `build/programs/clickhouse`, which can be used with `client` or `server` arguments.

 ## Building on Any Linux {#how-to-build-clickhouse-on-any-linux}

@ -107,7 +111,7 @@ The build requires the following components:
 - Yasm
 - Gawk

-If all the components are installed, you may build in the same way as the steps above.
+If all the components are installed, you may build it in the same way as the steps above.

 Example for OpenSUSE Tumbleweed:

@ -123,7 +127,7 @@ Example for Fedora Rawhide:

 ``` bash
 sudo yum update
-sudo yum --nogpg install git cmake make clang python3 ccache nasm yasm gawk
+sudo yum --nogpg install git cmake make clang python3 ccache lld nasm yasm gawk
 git clone --recursive https://github.com/ClickHouse/ClickHouse.git
 mkdir build
 cmake -S . -B build
--- a/docs/en/engines/table-engines/special/buffer.md
+++ b/docs/en/engines/table-engines/special/buffer.md
@ -13,7 +13,7 @@ A recommended alternative to the Buffer Table Engine is enabling [asynchronous i
 :::

 ``` sql
-Buffer(database, table, num_layers, min_time, max_time, min_rows, max_rows, min_bytes, max_bytes)
+Buffer(database, table, num_layers, min_time, max_time, min_rows, max_rows, min_bytes, max_bytes [,flush_time [,flush_rows [,flush_bytes]]])
 ```

 ### Engine parameters:
--- a/docs/en/operations/backup.md
+++ b/docs/en/operations/backup.md
@ -84,6 +84,7 @@ The BACKUP and RESTORE statements take a list of DATABASE and TABLE names, a des
    - `password` for the file on disk
    - `base_backup`: the destination of the previous backup of this source.  For example, `Disk('backups', '1.zip')`
    - `structure_only`: if enabled, allows to only backup or restore the CREATE statements without the data of tables
+    - `storage_policy`: storage policy for the tables being restored. See [Using Multiple Block Devices for Data Storage](../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes). This setting is only applicable to the `RESTORE` command. The specified storage policy applies only to tables with an engine from the `MergeTree` family.
    - `s3_storage_class`: the storage class used for S3 backup. For example, `STANDARD`

 ### Usage examples
--- a/docs/en/operations/settings/index.md
+++ b/docs/en/operations/settings/index.md
@ -7,6 +7,10 @@ pagination_next: en/operations/settings/settings

 # Settings Overview

+:::note
+XML-based Settings Profiles and [configuration files](https://clickhouse.com/docs/en/operations/configuration-files) are currently not supported for ClickHouse Cloud. To specify settings for your ClickHouse Cloud service, you must use [SQL-driven Settings Profiles](https://clickhouse.com/docs/en/operations/access-rights#settings-profiles-management).
+:::
+
 There are two main groups of ClickHouse settings:

 - Global server settings
--- a/docs/en/operations/settings/query-complexity.md
+++ b/docs/en/operations/settings/query-complexity.md
@ -298,7 +298,7 @@ Default value: `THROW`.
 - [JOIN clause](../../sql-reference/statements/select/join.md#select-join)
 - [Join table engine](../../engines/table-engines/special/join.md)

-## max_partitions_per_insert_block {#max-partitions-per-insert-block}
+## max_partitions_per_insert_block {#settings-max_partitions_per_insert_block}

 Limits the maximum number of partitions in a single inserted block.

@ -309,9 +309,18 @@ Default value: 100.

 **Details**

-When inserting data, ClickHouse calculates the number of partitions in the inserted block. If the number of partitions is more than `max_partitions_per_insert_block`, ClickHouse throws an exception with the following text:
+When inserting data, ClickHouse calculates the number of partitions in the inserted block. If the number of partitions is more than `max_partitions_per_insert_block`, ClickHouse either logs a warning or throws an exception based on `throw_on_max_partitions_per_insert_block`. Exceptions have the following text:

-> “Too many partitions for single INSERT block (more than” + toString(max_parts) + “). The limit is controlled by ‘max_partitions_per_insert_block’ setting. A large number of partitions is a common misconception. It will lead to severe negative performance impact, including slow server startup, slow INSERT queries and slow SELECT queries. Recommended total number of partitions for a table is under 1000..10000. Please note, that partitioning is not intended to speed up SELECT queries (ORDER BY key is sufficient to make range queries fast). Partitions are intended for data manipulation (DROP PARTITION, etc).”
+> “Too many partitions for a single INSERT block (`partitions_count` partitions, limit is ” + toString(max_partitions) + “). The limit is controlled by the ‘max_partitions_per_insert_block’ setting. A large number of partitions is a common misconception. It will lead to severe negative performance impact, including slow server startup, slow INSERT queries and slow SELECT queries. Recommended total number of partitions for a table is under 1000..10000. Please note, that partitioning is not intended to speed up SELECT queries (ORDER BY key is sufficient to make range queries fast). Partitions are intended for data manipulation (DROP PARTITION, etc).”
+
+## throw_on_max_partitions_per_insert_block {#settings-throw_on_max_partition_per_insert_block}
+
+Allows you to control behaviour when `max_partitions_per_insert_block` is reached.
+
+- `true`  - When an insert block reaches `max_partitions_per_insert_block`, an exception is raised.
+- `false` - Logs a warning when `max_partitions_per_insert_block` is reached.
+
+Default value: `true`

 ## max_temporary_data_on_disk_size_for_user {#settings_max_temporary_data_on_disk_size_for_user}

--- a/docs/en/operations/utilities/clickhouse-keeper-client.md
+++ b/docs/en/operations/utilities/clickhouse-keeper-client.md
@ -11,7 +11,7 @@ A client application to interact with clickhouse-keeper by its native protocol.

 -   `-q QUERY`, `--query=QUERY` — Query to execute. If this parameter is not passed, `clickhouse-keeper-client` will start in interactive mode.
 -   `-h HOST`, `--host=HOST` — Server host. Default value: `localhost`.
-   `-p N`, `--port=N` — Server port. Default value: 2181
+-   `-p N`, `--port=N` — Server port. Default value: 9181
 -   `--connection-timeout=TIMEOUT` — Set connection timeout in seconds. Default value: 10s.
 -   `--session-timeout=TIMEOUT` — Set session timeout in seconds. Default value: 10s.
 -   `--operation-timeout=TIMEOUT` — Set operation timeout in seconds. Default value: 10s.
@ -21,8 +21,8 @@ A client application to interact with clickhouse-keeper by its native protocol.
 ## Example {#clickhouse-keeper-client-example}

 ```bash
-./clickhouse-keeper-client -h localhost:2181 --connection-timeout 30 --session-timeout 30 --operation-timeout 30
-Connected to ZooKeeper at [::1]:2181 with session_id 137
+./clickhouse-keeper-client -h localhost:9181 --connection-timeout 30 --session-timeout 30 --operation-timeout 30
+Connected to ZooKeeper at [::1]:9181 with session_id 137
 / :) ls
 keeper foo bar
 / :) cd keeper
--- a/docs/en/sql-reference/table-functions/azureBlobStorageCluster.md
+++ b/docs/en/sql-reference/table-functions/azureBlobStorageCluster.md
@ -0,0 +1,47 @@
+---
+slug: /en/sql-reference/table-functions/azureBlobStorageCluster
+sidebar_position: 55
+sidebar_label: azureBlobStorageCluster
+title: "azureBlobStorageCluster Table Function"
+---
+
+Allows processing files from [Azure Blob Storage](https://azure.microsoft.com/en-us/products/storage/blobs) in parallel from many nodes in a specified cluster. On the initiator it creates a connection to all nodes in the cluster, expands the wildcards in the blob path, and dispatches each file dynamically. On the worker node it asks the initiator about the next task to process and processes it. This is repeated until all tasks are finished.
+This table function is similar to the [s3Cluster function](../../sql-reference/table-functions/s3Cluster.md).
+
+**Syntax**
+
+``` sql
+azureBlobStorageCluster(cluster_name, connection_string|storage_account_url, container_name, blobpath, [account_name, account_key, format, compression, structure])
+```
+
+**Arguments**
+
+- `cluster_name` — Name of a cluster that is used to build a set of addresses and connection parameters to remote and local servers.
+- `connection_string|storage_account_url` — connection_string includes account name & key ([Create connection string](https://learn.microsoft.com/en-us/azure/storage/common/storage-configure-connection-string?toc=%2Fazure%2Fstorage%2Fblobs%2Ftoc.json&bc=%2Fazure%2Fstorage%2Fblobs%2Fbreadcrumb%2Ftoc.json#configure-a-connection-string-for-an-azure-storage-account)) or you could also provide the storage account url here and account name & account key as separate parameters (see parameters account_name & account_key)
+- `container_name` - Container name
+- `blobpath` - file path. Supports following wildcards in readonly mode: `*`, `?`, `{abc,def}` and `{N..M}` where `N`, `M` — numbers, `'abc'`, `'def'` — strings.
+- `account_name` - if storage_account_url is used, then account name can be specified here
+- `account_key` - if storage_account_url is used, then account key can be specified here
+- `format` — The [format](../../interfaces/formats.md#formats) of the file.
+- `compression` — Supported values: `none`, `gzip/gz`, `brotli/br`, `xz/LZMA`, `zstd/zst`. By default, compression is autodetected from the file extension (same as setting it to `auto`).
+- `structure` — Structure of the table. Format `'column1_name column1_type, column2_name column2_type, ...'`.
+
+**Returned value**
+
+A table with the specified structure for reading or writing data in the specified file.
+
+**Examples**
+
+Select the count for the file `test_cluster_count.csv`, using all the nodes in the `cluster_simple` cluster:
+
+``` sql
+SELECT count(*) from azureBlobStorageCluster(
+        'cluster_simple', 'http://azurite1:10000/devstoreaccount1', 'test_container', 'test_cluster_count.csv', 'devstoreaccount1',
+        'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV',
+        'auto', 'key UInt64')
+```
+
+**See Also**
+
+- [AzureBlobStorage engine](../../engines/table-engines/integrations/azureBlobStorage.md)
+- [azureBlobStorage table function](../../sql-reference/table-functions/azureBlobStorage.md)
--- a/docs/en/sql-reference/table-functions/file.md
+++ b/docs/en/sql-reference/table-functions/file.md
@ -13,16 +13,18 @@ The `file` function can be used in `SELECT` and `INSERT` queries to read from or
 **Syntax**

 ``` sql
-file(path [,format] [,structure] [,compression])
+file([path_to_archive ::] path [,format] [,structure] [,compression])
 ```

 **Parameters**

 - `path` — The relative path to the file from [user_files_path](/docs/en/operations/server-configuration-parameters/settings.md#server_configuration_parameters-user_files_path). Path to file support following globs in read-only mode: `*`, `?`, `{abc,def}` and `{N..M}` where `N`, `M` — numbers, `'abc', 'def'` — strings.
+- `path_to_archive` — The relative path to a zip/tar/7z archive. The path to the archive supports the same globs as `path`.
 - `format` — The [format](/docs/en/interfaces/formats.md#formats) of the file.
 - `structure` — Structure of the table. Format: `'column1_name column1_type, column2_name column2_type, ...'`.
 - `compression` — The existing compression type when used in a `SELECT` query, or the desired compression type when used in an `INSERT` query.  The supported compression types are `gz`, `br`, `xz`, `zst`, `lz4`, and `bz2`.

+
 **Returned value**

 A table with the specified structure for reading or writing data in the specified file.
@ -128,6 +130,11 @@ file('test.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 UInt32');
 └─────────┴─────────┴─────────┘
 ```

+Getting data from `table.csv`, located in `archive1.zip` and/or `archive2.zip`:
+``` sql
+SELECT * FROM file('user_files/archives/archive{1..2}.zip :: table.csv');
+```
+
 ## Globs in Path

 Multiple path components can have globs. For being processed file must exist and match to the whole path pattern (not only suffix or prefix).
--- a/docs/ru/engines/table-engines/special/buffer.md
+++ b/docs/ru/engines/table-engines/special/buffer.md
@ -9,7 +9,7 @@ sidebar_label: Buffer
 Буферизует записываемые данные в оперативке, периодически сбрасывая их в другую таблицу. При чтении, производится чтение данных одновременно из буфера и из другой таблицы.

 ``` sql
-Buffer(database, table, num_layers, min_time, max_time, min_rows, max_rows, min_bytes, max_bytes)
+Buffer(database, table, num_layers, min_time, max_time, min_rows, max_rows, min_bytes, max_bytes [,flush_time [,flush_rows [,flush_bytes]]])
 ```

 Параметры движка:
--- a/docs/ru/operations/settings/query-complexity.md
+++ b/docs/ru/operations/settings/query-complexity.md
@ -311,9 +311,18 @@ FORMAT Null;

 **Подробности**

-При вставке данных, ClickHouse вычисляет количество партиций во вставленном блоке. Если число партиций больше, чем `max_partitions_per_insert_block`, ClickHouse генерирует исключение со следующим текстом:
+При вставке данных ClickHouse проверяет количество партиций во вставляемом блоке. Если количество партиций превышает значение `max_partitions_per_insert_block`, ClickHouse либо логирует предупреждение, либо выбрасывает исключение в зависимости от значения `throw_on_max_partitions_per_insert_block`. Исключения имеют следующий текст:

-> «Too many partitions for single INSERT block (more than» + toString(max_parts) + «). The limit is controlled by ‘max_partitions_per_insert_block’ setting. Large number of partitions is a common misconception. It will lead to severe negative performance impact, including slow server startup, slow INSERT queries and slow SELECT queries. Recommended total number of partitions for a table is under 1000..10000. Please note, that partitioning is not intended to speed up SELECT queries (ORDER BY key is sufficient to make range queries fast). Partitions are intended for data manipulation (DROP PARTITION, etc).»
+> “Too many partitions for a single INSERT block (`partitions_count` partitions, limit is ” + toString(max_partitions) + “). The limit is controlled by the ‘max_partitions_per_insert_block’ setting. A large number of partitions is a common misconception. It will lead to severe negative performance impact, including slow server startup, slow INSERT queries and slow SELECT queries. Recommended total number of partitions for a table is under 1000..10000. Please note, that partitioning is not intended to speed up SELECT queries (ORDER BY key is sufficient to make range queries fast). Partitions are intended for data manipulation (DROP PARTITION, etc).”
+
+## throw_on_max_partitions_per_insert_block {#settings-throw_on_max_partition_per_insert_block}
+
+Позволяет контролировать поведение при достижении `max_partitions_per_insert_block`
+
+- `true`  - Когда вставляемый блок достигает `max_partitions_per_insert_block`, возникает исключение.
+- `false` - Записывает предупреждение при достижении `max_partitions_per_insert_block`.
+
+Значение по умолчанию: `true`

 ## max_sessions_for_user {#max-sessions-per-user}

--- a/docs/zh/engines/table-engines/special/buffer.md
+++ b/docs/zh/engines/table-engines/special/buffer.md
@ -5,7 +5,7 @@ slug: /zh/engines/table-engines/special/buffer

 缓冲数据写入 RAM 中，周期性地将数据刷新到另一个表。在读取操作时，同时从缓冲区和另一个表读取数据。

-    Buffer(database, table, num_layers, min_time, max_time, min_rows, max_rows, min_bytes, max_bytes)
+    Buffer(database, table, num_layers, min_time, max_time, min_rows, max_rows, min_bytes, max_bytes [,flush_time [,flush_rows [,flush_bytes]]])

 引擎的参数：database，table - 要刷新数据的表。可以使用返回字符串的常量表达式而不是数据库名称。 num_layers - 并行层数。在物理上，该表将表示为 num_layers 个独立缓冲区。建议值为16。min_time，max_time，min_rows，max_rows，min_bytes，max_bytes - 从缓冲区刷新数据的条件。

--- a/programs/keeper-client/KeeperClient.cpp
+++ b/programs/keeper-client/KeeperClient.cpp
@ -131,7 +131,7 @@ void KeeperClient::defineOptions(Poco::Util::OptionSet & options)
            .binding("host"));

    options.addOption(
-        Poco::Util::Option("port", "p", "server port. default `2181`")
+        Poco::Util::Option("port", "p", "server port. default `9181`")
            .argument("<port>")
            .binding("port"));

@ -307,7 +307,7 @@ int KeeperClient::main(const std::vector<String> & /* args */)
    }

    auto host = config().getString("host", "localhost");
-    auto port = config().getString("port", "2181");
+    auto port = config().getString("port", "9181");
    zk_args.hosts = {host + ":" + port};
    zk_args.connection_timeout_ms = config().getInt("connection-timeout", 10) * 1000;
    zk_args.session_timeout_ms = config().getInt("session-timeout", 10) * 1000;
--- a/src/Access/LDAPClient.cpp
+++ b/src/Access/LDAPClient.cpp
@ -18,7 +18,8 @@
 namespace
 {

-template <typename T, typename = std::enable_if_t<std::is_fundamental_v<std::decay_t<T>>>>
+template <typename T>
+requires std::is_fundamental_v<std::decay_t<T>>
 void updateHash(SipHash & hash, const T & value)
 {
    hash.update(value);
--- a/src/AggregateFunctions/AggregateFunctionFlameGraph.cpp
+++ b/src/AggregateFunctions/AggregateFunctionFlameGraph.cpp
@ -0,0 +1,646 @@
+#include <AggregateFunctions/AggregateFunctionFactory.h>
+#include <AggregateFunctions/IAggregateFunction.h>
+#include <AggregateFunctions/FactoryHelpers.h>
+#include <Common/HashTable/HashMap.h>
+#include <Common/SymbolIndex.h>
+#include <Common/ArenaAllocator.h>
+#include <Core/Settings.h>
+#include <Columns/ColumnArray.h>
+#include <Columns/ColumnString.h>
+#include <Columns/ColumnsNumber.h>
+#include <DataTypes/DataTypeArray.h>
+#include <DataTypes/DataTypeString.h>
+#include <DataTypes/DataTypesNumber.h>
+#include <IO/WriteHelpers.h>
+#include <IO/Operators.h>
+#include <filesystem>
+
+namespace DB
+{
+namespace ErrorCodes
+{
+    extern const int FUNCTION_NOT_ALLOWED;
+    extern const int NOT_IMPLEMENTED;
+    extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
+    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
+}
+
+/// Prefix tree of stack traces. Every node corresponds to one frame address,
+/// and the path from the root to a node is a stack prefix.
+/// Nodes are allocated from an Arena and are never freed individually.
+struct AggregateFunctionFlameGraphTree
+{
+    struct ListNode;
+
+    /// One frame of a stack trace.
+    struct TreeNode
+    {
+        TreeNode * parent = nullptr;
+        /// Head of the singly linked list of child frames (nullptr for a leaf).
+        ListNode * children = nullptr;
+        /// Frame address. The root keeps 0 here, which is used as a stop marker when walking up via `parent`.
+        UInt64 ptr = 0;
+        /// Byte counter of this node. It is only read and initialized inside this struct; the code that updates it lives elsewhere.
+        size_t allocated = 0;
+    };
+
+    /// Element of the list of children of some TreeNode.
+    struct ListNode
+    {
+        ListNode * next = nullptr;
+        TreeNode * child = nullptr;
+    };
+
+    TreeNode root;
+
+    /// Allocates a new child for `parent` with frame address `ptr` and returns the list element which holds it.
+    /// The returned element is NOT linked into parent's list, the caller has to do it.
+    static ListNode * createChild(TreeNode * parent, UInt64 ptr, Arena * arena)
+    {
+
+        /// The memory is taken raw from the arena and no constructor is run,
+        /// so every field is assigned explicitly below.
+        ListNode * list_node = reinterpret_cast<ListNode *>(arena->alloc(sizeof(ListNode)));
+        TreeNode * tree_node = reinterpret_cast<TreeNode *>(arena->alloc(sizeof(TreeNode)));
+
+        list_node->child = tree_node;
+        list_node->next = nullptr;
+
+        tree_node->parent =parent;
+        tree_node->children = nullptr;
+        tree_node->ptr = ptr;
+        tree_node->allocated = 0;
+
+        return list_node;
+    }
+
+    /// Returns the node for the given stack, creating missing nodes on the way.
+    /// The stack is read from index 0 (child of the root) and is cut at the first zero frame.
+    /// For an empty stack the root itself is returned.
+    TreeNode * find(const UInt64 * stack, size_t stack_size, Arena * arena)
+    {
+        TreeNode * node = &root;
+        for (size_t i = 0; i < stack_size; ++i)
+        {
+            UInt64 ptr = stack[i];
+            if (ptr == 0)
+                break;
+
+            if (!node->children)
+            {
+                node->children = createChild(node, ptr, arena);
+                node = node->children->child;
+            }
+            else
+            {
+                /// Linear search in the list of children. The loop stops either on the match or on the last element.
+                ListNode * list = node->children;
+                while (list->child->ptr != ptr && list->next)
+                    list = list->next;
+
+                /// Not found: append a new child to the tail of the list.
+                if (list->child->ptr != ptr)
+                {
+                    list->next = createChild(node, ptr, arena);
+                    list = list->next;
+                }
+
+                node = list->child;
+            }
+        }
+
+        return node;
+    }
+
+    /// Appends `frame` as one more array row: the elements go to `values`, the new cumulative end offset goes to `offsets`.
+    static void append(DB::PaddedPODArray<UInt64> & values, DB::PaddedPODArray<UInt64> & offsets, std::vector<UInt64> & frame)
+    {
+        UInt64 prev = offsets.empty() ? 0 : offsets.back();
+        offsets.push_back(prev + frame.size());
+        for (UInt64 val : frame)
+            values.push_back(val);
+    }
+
+    struct Trace
+    {
+        using Frames = std::vector<UInt64>;
+
+        Frames frames;
+
+        /// The total number of bytes allocated for traces with the same prefix.
+        size_t allocated_total = 0;
+        /// This counter is relevant in case we want to filter some traces with small amount of bytes.
+        /// It shows the total number of bytes for *filtered* traces with the same prefix.
+        /// This is the value which is used in flamegraph.
+        size_t allocated_self = 0;
+    };
+
+    using Traces = std::vector<Trace>;
+
+    /// Converts the tree to a flat list of traces using an iterative depth-first traversal.
+    /// - max_depth : maximal number of frames in a trace, 0 means no limit;
+    /// - min_bytes : children with less than min_bytes allocated are not descended into,
+    ///               their bytes stay in allocated_self of the parent.
+    Traces dump(size_t max_depth, size_t min_bytes) const
+    {
+        Traces traces;
+        Trace::Frames frames;
+        /// Three parallel stacks, one element per level which is currently open.
+        /// nodes.back() is the next child to visit on that level, nullptr means that the level is finished.
+        std::vector<size_t> allocated_total;
+        std::vector<size_t> allocated_self;
+        std::vector<ListNode *> nodes;
+
+        nodes.push_back(root.children);
+        allocated_total.push_back(root.allocated);
+        allocated_self.push_back(root.allocated);
+
+        while (!nodes.empty())
+        {
+            if (nodes.back() == nullptr)
+            {
+                /// All children of the current level are processed: emit the trace of the level itself and close it.
+                traces.push_back({frames, allocated_total.back(), allocated_self.back()});
+
+                nodes.pop_back();
+                allocated_total.pop_back();
+                allocated_self.pop_back();
+
+                /// We don't have root's frame so frames are empty in the end.
+                if (!frames.empty())
+                    frames.pop_back();
+
+                continue;
+            }
+
+            /// Take the next child and advance the iterator of the current level.
+            TreeNode * current = nodes.back()->child;
+            nodes.back() = nodes.back()->next;
+
+            bool enough_bytes = current->allocated >= min_bytes;
+            bool enough_depth = max_depth == 0 || nodes.size() < max_depth;
+
+            if (enough_bytes)
+            {
+                frames.push_back(current->ptr);
+                /// The child is reported separately, so its bytes are no longer 'self' bytes of the parent.
+                allocated_self.back() -= current->allocated;
+
+                if (enough_depth)
+                {
+                    /// Open a new level for the children of `current`.
+                    allocated_total.push_back(current->allocated);
+                    allocated_self.push_back(current->allocated);
+                    nodes.push_back(current->children);
+                }
+                else
+                {
+                    /// Depth limit is reached: report the whole subtree as a single trace.
+                    traces.push_back({frames, current->allocated, current->allocated});
+                    frames.pop_back();
+                }
+            }
+        }
+
+        return traces;
+    }
+};
+
+/// Appends `length` bytes starting at `pos` to `chars` as one zero-terminated row
+/// and stores the end position of the row (including the terminating zero) to `offsets`.
+static void insertData(DB::PaddedPODArray<UInt8> & chars, DB::PaddedPODArray<UInt64> & offsets, const char * pos, size_t length)
+{
+    const size_t row_begin = chars.size();
+    const size_t terminator_pos = row_begin + length;
+    const size_t row_end = terminator_pos + 1;
+
+    /// One extra byte is reserved for the terminating zero.
+    chars.resize(row_end);
+
+    if (length != 0)
+        memcpy(chars.data() + row_begin, pos, length);
+
+    chars[terminator_pos] = 0;
+    offsets.push_back(row_end);
+}
+
+/// Writes every line of `str` (lines are separated by line feed) as a separate row of ColumnString.
+/// Line feeds are not copied. A non-empty tail without a trailing line feed becomes the last row.
+static void fillColumn(DB::PaddedPODArray<UInt8> & chars, DB::PaddedPODArray<UInt64> & offsets, const std::string & str)
+{
+    const char * const data = str.data();
+    const size_t total = str.size();
+    size_t line_begin = 0;
+
+    for (size_t newline = str.find('\n'); newline != std::string::npos; newline = str.find('\n', line_begin))
+    {
+        insertData(chars, offsets, data + line_begin, newline - line_begin);
+        line_begin = newline + 1;
+    }
+
+    /// The rest after the last line feed, if any.
+    if (line_begin < total)
+        insertData(chars, offsets, data + line_begin, total - line_begin);
+}
+
+/// Writes traces in the 'folded stacks' text format: frames joined with ';',
+/// then a space and the value of allocated_self, one trace per line.
+/// Every line becomes a separate row of ColumnString (chars + offsets).
+/// Traces with zero allocated_self are skipped.
+void dumpFlameGraph(
+    const AggregateFunctionFlameGraphTree::Traces & traces,
+    DB::PaddedPODArray<UInt8> & chars,
+    DB::PaddedPODArray<UInt64> & offsets)
+{
+    DB::WriteBufferFromOwnString out;
+
+#if defined(__ELF__) && !defined(OS_FREEBSD)
+    const DB::SymbolIndex & symbol_index = DB::SymbolIndex::instance();
+#endif
+
+    for (const auto & trace : traces)
+    {
+        if (trace.allocated_self == 0)
+            continue;
+
+        for (size_t i = 0; i < trace.frames.size(); ++i)
+        {
+            if (i)
+                out << ";";
+
+            const void * ptr = reinterpret_cast<const void *>(trace.frames[i]);
+
+            /// Frames are written as demangled symbol names when the symbol is found, and as hex addresses otherwise.
+#if defined(__ELF__) && !defined(OS_FREEBSD)
+            if (const auto * symbol = symbol_index.findSymbol(ptr))
+                writeString(demangle(symbol->name), out);
+            else
+                DB::writePointerHex(ptr, out);
+#else
+            DB::writePointerHex(ptr, out);
+#endif
+        }
+
+        out << ' ' << trace.allocated_self << "\n";
+    }
+
+    fillColumn(chars, offsets, out.str());
+}
+
+/// State of the flameGraph aggregate function.
+/// Contains the tree of stack traces and, for every non-zero pointer, the lists of
+/// allocations and deallocations which were not matched with each other yet.
+struct AggregateFunctionFlameGraphData
+{
+    /// Element of a singly linked list. It is used for allocations, for deallocations and for the free list.
+    struct Entry
+    {
+        /// Node of the tree for the stack of an allocation. Not used (nullptr) for deallocations.
+        AggregateFunctionFlameGraphTree::TreeNode * trace;
+        UInt64 size;
+        Entry * next = nullptr;
+    };
+
+    /// Not matched allocations and deallocations for a single pointer.
+    struct Pair
+    {
+        Entry * allocation = nullptr;
+        Entry * deallocation = nullptr;
+    };
+
+    using Entries = HashMap<UInt64, Pair>;
+
+    AggregateFunctionFlameGraphTree tree;
+    Entries entries;
+    /// Released entries which are reused by alloc().
+    Entry * free_list = nullptr;
+
+    /// Returns an entry from the free list or a new one from the arena.
+    /// The fields of the returned entry are NOT initialized, the caller has to assign all of them.
+    Entry * alloc(Arena * arena)
+    {
+        if (free_list)
+        {
+            auto * res = free_list;
+            free_list = free_list->next;
+            return res;
+        }
+
+        return reinterpret_cast<Entry *>(arena->alloc(sizeof(Entry)));
+    }
+
+    /// Puts the entry to the free list.
+    void release(Entry * entry)
+    {
+        entry->next = free_list;
+        free_list = entry;
+    }
+
+    /// Adds the size of the allocation to its node and to all ancestors up to the root.
+    static void track(Entry * allocation)
+    {
+        auto * node = allocation->trace;
+        while (node)
+        {
+            node->allocated += allocation->size;
+            node = node->parent;
+        }
+    }
+
+    /// The opposite of track(): subtracts the size from the node and from all ancestors.
+    static void untrack(Entry * allocation)
+    {
+        auto * node = allocation->trace;
+        while (node)
+        {
+            node->allocated -= allocation->size;
+            node = node->parent;
+        }
+    }
+
+    /// Unlinks and returns the first entry of the list with exactly the given size, or nullptr if there is no such entry.
+    static Entry * tryFindMatchAndRemove(Entry *& list, UInt64 size)
+    {
+        if (!list)
+            return nullptr;
+
+        if (list->size == size)
+        {
+            Entry * entry = list;
+            list = list->next;
+            return entry;
+        }
+        else
+        {
+            Entry * parent = list;
+            while (parent->next && parent->next->size != size)
+                parent = parent->next;
+
+            if (parent->next && parent->next->size == size)
+            {
+                Entry * entry = parent->next;
+                parent->next = entry->next;
+                return entry;
+            }
+
+            return nullptr;
+        }
+    }
+
+    /// Registers a single event.
+    /// - ptr  : address of the allocation, 0 means that allocations are not matched with deallocations;
+    /// - size : > 0 for an allocation, < 0 for a deallocation, 0 is ignored;
+    /// - stack, stack_size : the stack trace of the event (it is used only for allocations).
+    void add(UInt64 ptr, Int64 size, const UInt64 * stack, size_t stack_size, Arena * arena)
+    {
+        /// In case if argument is nullptr, only track allocations.
+        if (ptr == 0)
+        {
+            if (size > 0)
+            {
+                auto * node = tree.find(stack, stack_size, arena);
+                Entry entry{.trace = node, .size = UInt64(size)};
+                track(&entry);
+            }
+
+            return;
+        }
+
+        auto & place = entries[ptr];
+        if (size > 0)
+        {
+            /// The deallocation could be processed before the allocation. In this case they just cancel each other.
+            if (auto * deallocation = tryFindMatchAndRemove(place.deallocation, size))
+            {
+                release(deallocation);
+            }
+            else
+            {
+                auto * node = tree.find(stack, stack_size, arena);
+
+                auto * allocation = alloc(arena);
+                allocation->size = UInt64(size);
+                allocation->trace = node;
+
+                track(allocation);
+
+                allocation->next = place.allocation;
+                place.allocation = allocation;
+            }
+        }
+        else if (size < 0)
+        {
+            /// Negate in unsigned arithmetic: plain `-size` is a signed overflow (UB) for the minimal Int64 value.
+            const UInt64 abs_size = UInt64(0) - static_cast<UInt64>(size);
+            if (auto * allocation = tryFindMatchAndRemove(place.allocation, abs_size))
+            {
+                untrack(allocation);
+                release(allocation);
+            }
+            else
+            {
+                /// Remember the deallocation, the matching allocation may come later.
+                auto * deallocation = alloc(arena);
+                deallocation->size = abs_size;
+                /// The entry may be recycled or come raw from the arena, don't leave a stale pointer in it.
+                deallocation->trace = nullptr;
+
+                deallocation->next = place.deallocation;
+                place.deallocation = deallocation;
+            }
+        }
+    }
+
+    /// Adds the bytes from the leaves of other_tree to this tree (as allocations without a pointer).
+    /// Only nodes without children are added, together with their full stack.
+    void merge(const AggregateFunctionFlameGraphTree & other_tree, Arena * arena)
+    {
+        AggregateFunctionFlameGraphTree::Trace::Frames frames;
+        /// Stack for the iterative depth-first traversal: the next child to visit on every open level.
+        std::vector<AggregateFunctionFlameGraphTree::ListNode *> nodes;
+
+        nodes.push_back(other_tree.root.children);
+
+        while (!nodes.empty())
+        {
+            if (nodes.back() == nullptr)
+            {
+                nodes.pop_back();
+
+                /// We don't have root's frame so frames are empty in the end.
+                if (!frames.empty())
+                    frames.pop_back();
+
+                continue;
+            }
+
+            AggregateFunctionFlameGraphTree::TreeNode * current = nodes.back()->child;
+            nodes.back() = nodes.back()->next;
+
+            frames.push_back(current->ptr);
+
+            if (current->children)
+                nodes.push_back(current->children);
+            else
+            {
+                if (current->allocated)
+                    add(0, current->allocated, frames.data(), frames.size(), arena);
+
+                frames.pop_back();
+            }
+        }
+    }
+
+    /// Merges the other state into this one.
+    /// Note: counters in the tree of `other` are modified (through non-const node pointers),
+    /// despite the fact that `other` is passed by const reference.
+    void merge(const AggregateFunctionFlameGraphData & other, Arena * arena)
+    {
+        AggregateFunctionFlameGraphTree::Trace::Frames frames;
+        for (const auto & entry : other.entries)
+        {
+            for (auto * allocation = entry.value.second.allocation; allocation; allocation = allocation->next)
+            {
+                /// Restore the stack walking from the node up to the root (the root is the node with zero ptr).
+                frames.clear();
+                const auto * node = allocation->trace;
+                while (node->ptr)
+                {
+                    frames.push_back(node->ptr);
+                    node = node->parent;
+                }
+
+                std::reverse(frames.begin(), frames.end());
+                add(entry.value.first, allocation->size, frames.data(), frames.size(), arena);
+                /// The allocation is already added with its pointer, remove it from the other tree
+                /// so that it is not counted twice by the merge of trees below.
+                untrack(allocation);
+            }
+
+            for (auto * deallocation = entry.value.second.deallocation; deallocation; deallocation = deallocation->next)
+            {
+                /// Negate in unsigned arithmetic to avoid signed overflow for sizes which don't fit into positive Int64.
+                add(entry.value.first, static_cast<Int64>(UInt64(0) - deallocation->size), nullptr, 0, arena);
+            }
+        }
+
+        merge(other.tree, arena);
+    }
+
+    /// Writes the flamegraph as rows of ColumnString. See AggregateFunctionFlameGraphTree::dump for max_depth and min_bytes.
+    void dumpFlameGraph(
+        DB::PaddedPODArray<UInt8> & chars,
+        DB::PaddedPODArray<UInt64> & offsets,
+        size_t max_depth, size_t min_bytes) const
+    {
+        DB::dumpFlameGraph(tree.dump(max_depth, min_bytes), chars, offsets);
+    }
+};
+
+/// Aggregate function which builds a flamegraph using the list of stacktraces.
+/// The output is an array of strings which can be used by flamegraph.pl util.
+/// See https://github.com/brendangregg/FlameGraph
+///
+/// Syntax: flameGraph(trace, [size = 1], [ptr = 0])
+/// - trace : Array(UInt64), a stacktrace
+/// - size  : Int64, an allocation size (for memory profiling)
+/// - ptr   : UInt64, an allocation address
+/// In case if ptr != 0, a flameGraph will map allocations (size > 0) and deallocations (size < 0) with the same size and ptr.
+/// Only allocations which were not freed are shown. Not mapped deallocations are ignored.
+///
+/// Usage:
+///
+/// * Build a flamegraph based on CPU query profiler
+/// set query_profiler_cpu_time_period_ns=10000000;
+/// SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;
+/// clickhouse client --allow_introspection_functions=1
+///     -q "select arrayJoin(flameGraph(arrayReverse(trace))) from system.trace_log where trace_type = 'CPU' and query_id = 'xxx'"
+///     | ~/dev/FlameGraph/flamegraph.pl  > flame_cpu.svg
+///
+/// * Build a flamegraph based on memory query profiler, showing all allocations
+/// set memory_profiler_sample_probability=1, max_untracked_memory=1;
+/// SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;
+/// clickhouse client --allow_introspection_functions=1
+///     -q "select arrayJoin(flameGraph(trace, size)) from system.trace_log where trace_type = 'MemorySample' and query_id = 'xxx'"
+///     | ~/dev/FlameGraph/flamegraph.pl --countname=bytes --color=mem > flame_mem.svg
+///
+/// * Build a flamegraph based on memory query profiler, showing allocations which were not deallocated in query context
+/// set memory_profiler_sample_probability=1, max_untracked_memory=1, use_uncompressed_cache=1, merge_tree_max_rows_to_use_cache=100000000000, merge_tree_max_bytes_to_use_cache=1000000000000;
+/// SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;
+/// clickhouse client --allow_introspection_functions=1
+///     -q "select arrayJoin(flameGraph(trace, size, ptr)) from system.trace_log where trace_type = 'MemorySample' and query_id = 'xxx'"
+///     | ~/dev/FlameGraph/flamegraph.pl --countname=bytes --color=mem > flame_mem_untracked.svg
+///
+/// *  Build a flamegraph based on memory query profiler, showing active allocations at the fixed point of time
+/// set memory_profiler_sample_probability=1, max_untracked_memory=1;
+/// SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;
+/// 1. Memory usage per second
+/// select event_time, m, formatReadableSize(max(s) as m) from (select event_time, sum(size) over (order by event_time) as s from system.trace_log where query_id = 'xxx' and trace_type = 'MemorySample') group by event_time order by event_time;
+/// 2. Find a time point with maximal memory usage
+/// select argMax(event_time, s), max(s) from (select event_time, sum(size) over (order by event_time) as s from system.trace_log where query_id = 'xxx' and trace_type = 'MemorySample');
+/// 3. Fix active allocations at fixed point of time
+/// clickhouse client --allow_introspection_functions=1
+///      -q "select arrayJoin(flameGraph(trace, size, ptr)) from (select * from system.trace_log where trace_type = 'MemorySample' and query_id = 'xxx' and event_time <= 'yyy' order by event_time)"
+///      | ~/dev/FlameGraph/flamegraph.pl --countname=bytes --color=mem > flame_mem_time_point_pos.svg
+/// 4. Find deallocations at fixed point of time
+/// clickhouse client --allow_introspection_functions=1
+///      -q "select arrayJoin(flameGraph(trace, -size, ptr)) from (select * from system.trace_log where trace_type = 'MemorySample' and query_id = 'xxx' and event_time > 'yyy' order by event_time desc)"
+///      | ~/dev/FlameGraph/flamegraph.pl --countname=bytes --color=mem > flame_mem_time_point_neg.svg
+class AggregateFunctionFlameGraph final : public IAggregateFunctionDataHelper<AggregateFunctionFlameGraphData, AggregateFunctionFlameGraph>
+{
+public:
+    /// The result type does not depend on the arguments, see createResultType().
+    explicit AggregateFunctionFlameGraph(const DataTypes & argument_types_)
+        : IAggregateFunctionDataHelper<AggregateFunctionFlameGraphData, AggregateFunctionFlameGraph>(argument_types_, {}, createResultType())
+    {
+    }
+
+    String getName() const override
+    {
+        return "flameGraph";
+    }
+
+    /// The function always returns Array(String).
+    static DataTypePtr createResultType()
+    {
+        auto element_type = std::make_shared<DataTypeString>();
+        return std::make_shared<DataTypeArray>(std::move(element_type));
+    }
+
+    bool allocatesMemoryInArena() const override
+    {
+        return true;
+    }
+
+    /// Feeds one row into the state.
+    /// columns[0] is the trace (Array(UInt64)); columns[1] (Int64 size) and columns[2] (UInt64 ptr)
+    /// are read only when the function was created with that many arguments.
+    void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena * arena) const override
+    {
+        const auto & trace_column = assert_cast<const ColumnArray &>(*columns[0]);
+        const auto & offsets = trace_column.getOffsets();
+        const auto & frames = assert_cast<const ColumnUInt64 &>(trace_column.getData()).getData();
+
+        /// The frames of row `row_num` occupy [begin, offsets[row_num]) in the flattened nested column.
+        UInt64 begin = row_num ? offsets[row_num - 1] : 0;
+        UInt64 frame_count = offsets[row_num] - begin;
+
+        /// Without an explicit size argument every row counts as 1.
+        Int64 allocated = 1;
+        if (argument_types.size() >= 2)
+            allocated = assert_cast<const ColumnInt64 &>(*columns[1]).getData()[row_num];
+
+        /// Without an explicit ptr argument the pointer is 0.
+        UInt64 ptr = 0;
+        if (argument_types.size() >= 3)
+            ptr = assert_cast<const ColumnUInt64 &>(*columns[2]).getData()[row_num];
+
+        this->data(place).add(ptr, allocated, frames.data() + begin, frame_count, arena);
+    }
+
+    /// Intentionally a no-op: nothing is added to the state for default rows.
+    void addManyDefaults(
+        AggregateDataPtr __restrict /*place*/,
+        const IColumn ** /*columns*/,
+        size_t /*length*/,
+        Arena * /*arena*/) const override
+    {
+    }
+
+    void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena * arena) const override
+    {
+        const auto & other_state = this->data(rhs);
+        this->data(place).merge(other_state, arena);
+    }
+
+    /// The state cannot be serialized; always throws NOT_IMPLEMENTED.
+    void serialize(ConstAggregateDataPtr __restrict, WriteBuffer &, std::optional<size_t> /* version */) const override
+    {
+        throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Serialization for function flameGraph is not implemented.");
+    }
+
+    /// The state cannot be deserialized; always throws NOT_IMPLEMENTED.
+    void deserialize(AggregateDataPtr __restrict, ReadBuffer &, std::optional<size_t> /* version */, Arena *) const override
+    {
+        throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Deserialization for function flameGraph is not implemented.");
+    }
+
+    /// Writes the result row into `to`, which must be a ColumnArray over a ColumnString.
+    void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
+    {
+        auto & result_array = assert_cast<ColumnArray &>(to);
+        auto & strings = assert_cast<ColumnString &>(result_array.getData());
+
+        /// The state writes directly into the chars/offsets buffers of the nested string column.
+        this->data(place).dumpFlameGraph(strings.getChars(), strings.getOffsets(), 0, 0);
+
+        /// Close the array of this row: its end offset is the total number of strings so far.
+        result_array.getOffsets().push_back(strings.size());
+    }
+};
+
+/// Validates the call signature of flameGraph: no parameters and 1 to 3 arguments
+/// of types Array(UInt64), Int64 and UInt64 (in this order).
+/// Throws NUMBER_OF_ARGUMENTS_DOESNT_MATCH or ILLEGAL_TYPE_OF_ARGUMENT on mismatch.
+static void check(const std::string & name, const DataTypes & argument_types, const Array & params)
+{
+    assertNoParameters(name, params);
+
+    const size_t num_arguments = argument_types.size();
+    if (num_arguments < 1 || num_arguments > 3)
+        throw Exception(
+            ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
+            "Aggregate function {} requires 1 to 3 arguments : trace, [size = 1], [ptr = 0]",
+            name);
+
+    /// Reference types the arguments are compared against.
+    const auto expected_ptr = std::make_shared<DataTypeUInt64>();
+    const auto expected_size = std::make_shared<DataTypeInt64>();
+    const auto expected_trace = std::make_shared<DataTypeArray>(expected_ptr);
+
+    const auto & trace_argument = argument_types[0];
+    if (!trace_argument->equals(*expected_trace))
+        throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
+            "First argument (trace) for function {} must be Array(UInt64), but it has type {}",
+            name, trace_argument->getName());
+
+    if (num_arguments >= 2)
+    {
+        const auto & size_argument = argument_types[1];
+        if (!size_argument->equals(*expected_size))
+            throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
+                "Second argument (size) for function {} must be Int64, but it has type {}",
+                name, size_argument->getName());
+    }
+
+    if (num_arguments >= 3)
+    {
+        const auto & ptr_argument = argument_types[2];
+        if (!ptr_argument->equals(*expected_ptr))
+            throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
+                "Third argument (ptr) for function {} must be UInt64, but it has type {}",
+                name, ptr_argument->getName());
+    }
+}
+
+/// Factory callback for flameGraph.
+/// Throws FUNCTION_NOT_ALLOWED unless introspection functions are enabled in `settings`,
+/// then validates parameters and argument types (see check()) and creates the function.
+AggregateFunctionPtr createAggregateFunctionFlameGraph(const std::string & name, const DataTypes & argument_types, const Array & params, const Settings * settings)
+{
+    /// `settings` is a plain pointer: refuse to create the function when it is null
+    /// instead of dereferencing it.
+    if (!settings || !settings->allow_introspection_functions)
+        throw Exception(ErrorCodes::FUNCTION_NOT_ALLOWED,
+        "Introspection functions are disabled, because setting 'allow_introspection_functions' is set to 0");
+
+    check(name, argument_types, params);
+    return std::make_shared<AggregateFunctionFlameGraph>(argument_types);
+}
+
+/// Registers flameGraph in the aggregate function factory.
+void registerAggregateFunctionFlameGraph(AggregateFunctionFactory & factory)
+{
+    /// Start from the default properties and set only the two flags this function needs.
+    AggregateFunctionProperties flame_graph_properties{};
+    flame_graph_properties.returns_default_when_only_null = true;
+    flame_graph_properties.is_order_dependent = true;
+
+    factory.registerFunction("flameGraph", { createAggregateFunctionFlameGraph, flame_graph_properties });
+}
+
+}
--- a/src/AggregateFunctions/registerAggregateFunctions.cpp
+++ b/src/AggregateFunctions/registerAggregateFunctions.cpp
@ -80,6 +80,7 @@ void registerAggregateFunctionExponentialMovingAverage(AggregateFunctionFactory
 void registerAggregateFunctionSparkbar(AggregateFunctionFactory &);
 void registerAggregateFunctionIntervalLengthSum(AggregateFunctionFactory &);
 void registerAggregateFunctionAnalysisOfVariance(AggregateFunctionFactory &);
+void registerAggregateFunctionFlameGraph(AggregateFunctionFactory &);
 void registerAggregateFunctionKolmogorovSmirnovTest(AggregateFunctionFactory & factory);

 class AggregateFunctionCombinatorFactory;
@ -173,6 +174,7 @@ void registerAggregateFunctions()
        registerAggregateFunctionExponentialMovingAverage(factory);
        registerAggregateFunctionSparkbar(factory);
        registerAggregateFunctionAnalysisOfVariance(factory);
+        registerAggregateFunctionFlameGraph(factory);
        registerAggregateFunctionKolmogorovSmirnovTest(factory);

        registerWindowFunctions(factory);
--- a/src/Analyzer/SetUtils.cpp
+++ b/src/Analyzer/SetUtils.cpp
@ -70,10 +70,13 @@ Block createBlockFromCollection(const Collection & collection, const DataTypes &
    {
        if (columns_size == 1)
        {
-            auto field = convertFieldToType(value, *block_types[0]);
+            auto field = convertFieldToTypeStrict(value, *block_types[0]);
+            if (!field)
+                continue;
+
            bool need_insert_null = transform_null_in && block_types[0]->isNullable();
-            if (!field.isNull() || need_insert_null)
-                columns[0]->insert(std::move(field));
+            if (!field->isNull() || need_insert_null)
+                columns[0]->insert(*field);

            continue;
        }
@ -98,7 +101,11 @@ Block createBlockFromCollection(const Collection & collection, const DataTypes &
        size_t i = 0;
        for (; i < tuple_size; ++i)
        {
-            tuple_values[i] = convertFieldToType(tuple[i], *block_types[i]);
+            auto converted_field = convertFieldToTypeStrict(tuple[i], *block_types[i]);
+            if (!converted_field)
+                break;
+            tuple_values[i] = std::move(*converted_field);
+
            bool need_insert_null = transform_null_in && block_types[i]->isNullable();
            if (tuple_values[i].isNull() && !need_insert_null)
                break;
--- a/src/Backups/RestoreSettings.cpp
+++ b/src/Backups/RestoreSettings.cpp
@ -8,6 +8,7 @@
 #include <boost/algorithm/string/predicate.hpp>
 #include <Common/FieldVisitorConvertToNumber.h>
 #include <Backups/SettingsFieldOptionalUUID.h>
+#include <Backups/SettingsFieldOptionalString.h>


 namespace DB
@ -164,6 +165,7 @@ namespace
    M(Bool, allow_s3_native_copy) \
    M(Bool, internal) \
    M(String, host_id) \
+    M(OptionalString, storage_policy) \
    M(OptionalUUID, restore_uuid)


--- a/src/Backups/RestoreSettings.h
+++ b/src/Backups/RestoreSettings.h
@ -117,6 +117,9 @@ struct RestoreSettings
    /// The current host's ID in the format 'escaped_host_name:port'.
    String host_id;

+    /// Alternative storage policy that may be specified in the SETTINGS clause of RESTORE queries
+    std::optional<String> storage_policy;
+
    /// Internal, should not be specified by user.
    /// Cluster's hosts' IDs in the format 'escaped_host_name:port' for all shards and replicas in a cluster specified in BACKUP ON CLUSTER.
    std::vector<Strings> cluster_host_ids;
--- a/src/Backups/RestorerFromBackup.cpp
+++ b/src/Backups/RestorerFromBackup.cpp
@ -322,6 +322,7 @@ void RestorerFromBackup::findTableInBackup(const QualifiedTableName & table_name
    read_buffer.reset();
    ParserCreateQuery create_parser;
    ASTPtr create_table_query = parseQuery(create_parser, create_query_str, 0, DBMS_DEFAULT_MAX_PARSER_DEPTH);
+    applyCustomStoragePolicy(create_table_query);
    renameDatabaseAndTableNameInCreateQuery(create_table_query, renaming_map, context->getGlobalContext());

    QualifiedTableName table_name = renaming_map.getNewTableName(table_name_in_backup);
@ -625,6 +626,24 @@ void RestorerFromBackup::checkDatabase(const String & database_name)
    }
 }

+/// Rewrites the `storage_policy` setting of a CREATE query according to the RESTORE settings.
+/// Does nothing if no storage policy was given in the RESTORE settings, if `query_ptr` is null,
+/// or if the query has no storage definition with a settings clause.
+void RestorerFromBackup::applyCustomStoragePolicy(ASTPtr query_ptr)
+{
+    if (!query_ptr || !restore_settings.storage_policy.has_value())
+        return;
+
+    ASTStorage * storage = query_ptr->as<ASTCreateQuery &>().storage;
+    if (!storage || !storage->settings)
+        return;
+
+    constexpr auto setting_name = "storage_policy";
+    auto & changes = storage->settings->changes;
+
+    if (restore_settings.storage_policy.value().empty())
+    {
+        /// it has been set to "" deliberately, so the source storage policy is erased
+        changes.removeSetting(setting_name);
+    }
+    else
+    {
+        /// it has been set to a custom value, so it either overwrites the existing value or is added as a new one
+        changes.setSetting(setting_name, restore_settings.storage_policy.value());
+    }
+}
+
 void RestorerFromBackup::removeUnresolvedDependencies()
 {
    auto need_exclude_dependency = [this](const StorageID & table_id)
--- a/src/Backups/RestorerFromBackup.h
+++ b/src/Backups/RestorerFromBackup.h
@ -95,6 +95,8 @@ private:
    void createDatabase(const String & database_name) const;
    void checkDatabase(const String & database_name);

+    void applyCustomStoragePolicy(ASTPtr query_ptr);
+
    void removeUnresolvedDependencies();
    void createTables();
    void createTable(const QualifiedTableName & table_name);
--- a/src/Backups/SettingsFieldOptionalString.cpp
+++ b/src/Backups/SettingsFieldOptionalString.cpp
@ -0,0 +1,29 @@
+#include <Backups/SettingsFieldOptionalString.h>
+#include <Common/ErrorCodes.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int CANNOT_PARSE_BACKUP_SETTINGS;
+}
+
+/// Builds the setting from a Field: Null becomes an unset value, String becomes that string.
+/// Any other field type throws CANNOT_PARSE_BACKUP_SETTINGS.
+SettingFieldOptionalString::SettingFieldOptionalString(const Field & field)
+{
+    const auto field_type = field.getType();
+
+    if (field_type == Field::Types::String)
+        value = field.get<const String &>();
+    else if (field_type == Field::Types::Null)
+        value = std::nullopt;
+    else
+        throw Exception(ErrorCodes::CANNOT_PARSE_BACKUP_SETTINGS, "Cannot get string from {}", field);
+}
+
+}
--- a/src/Backups/SettingsFieldOptionalString.h
+++ b/src/Backups/SettingsFieldOptionalString.h
@ -0,0 +1,20 @@
+#pragma once
+
+#include <optional>
+#include <Core/SettingsFields.h>
+
+namespace DB
+{
+
+/// Setting field holding an optional string.
+struct SettingFieldOptionalString
+{
+    std::optional<String> value;
+
+    explicit SettingFieldOptionalString(const std::optional<String> & value_)
+        : value(value_)
+    {
+    }
+
+    explicit SettingFieldOptionalString(const Field & field);
+
+    /// Note that an unset value and an empty string both convert to a Field with an empty string.
+    explicit operator Field() const
+    {
+        if (value)
+            return Field(toString(*value));
+        return Field(String(""));
+    }
+};
+
+}
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -576,6 +576,10 @@ if (TARGET ch_contrib::bzip2)
    target_link_libraries (clickhouse_common_io PRIVATE ch_contrib::bzip2)
 endif()

+if (TARGET ch_contrib::libarchive)
+    target_link_libraries (clickhouse_common_io PUBLIC ch_contrib::libarchive)
+endif()
+
 if (TARGET ch_contrib::minizip)
    target_link_libraries (clickhouse_common_io PRIVATE ch_contrib::minizip)
 endif ()
--- a/src/Client/ClientBase.cpp
+++ b/src/Client/ClientBase.cpp
@ -1436,6 +1436,7 @@ void ClientBase::sendData(Block & sample, const ColumnsDescription & columns_des
            ConstraintsDescription{},
            String{},
            {},
+            String{},
        };
        StoragePtr storage = std::make_shared<StorageFile>(in_file, global_context->getUserFilesPath(), args);
        storage->startup();
--- a/src/Common/AllocationTrace.h
+++ b/src/Common/AllocationTrace.h
@ -0,0 +1,34 @@
+#pragma once
+#include <cstddef>
+#include <base/defines.h>
+
+/// This is a structure which is returned by MemoryTracker.
+/// Methods onAlloc/onFree should be called after the actual memory allocation, if it succeeded.
+/// For now, it will only collect allocation trace with sample_probability.
+struct AllocationTrace
+{
+    /// A default-constructed trace has sample_probability == 0, so onAlloc/onFree do nothing.
+    AllocationTrace() = default;
+    explicit AllocationTrace(double sample_probability_) : sample_probability(sample_probability_) {}
+
+    /// Reports an allocation of `size` bytes at `ptr`.
+    /// Returns immediately when the probability is not positive; otherwise calls onAllocImpl.
+    ALWAYS_INLINE void onAlloc(void * ptr, size_t size) const
+    {
+        if (likely(sample_probability <= 0))
+            return;
+
+        onAllocImpl(ptr, size);
+    }
+
+    /// Reports a deallocation of `size` bytes at `ptr`.
+    /// Returns immediately when the probability is not positive; otherwise calls onFreeImpl.
+    ALWAYS_INLINE void onFree(void * ptr, size_t size) const
+    {
+        if (likely(sample_probability <= 0))
+            return;
+
+        onFreeImpl(ptr, size);
+    }
+
+private:
+    double sample_probability = 0;
+
+    /// Only declared here (no inline body), unlike the ALWAYS_INLINE wrappers above.
+    void onAllocImpl(void * ptr, size_t size) const;
+    void onFreeImpl(void * ptr, size_t size) const;
+};
--- a/src/Common/Allocator.h
+++ b/src/Common/Allocator.h
@ -99,8 +99,10 @@ public:
    void * alloc(size_t size, size_t alignment = 0)
    {
        checkSize(size);
-        CurrentMemoryTracker::alloc(size);
-        return allocNoTrack(size, alignment);
+        auto trace = CurrentMemoryTracker::alloc(size);
+        void * ptr = allocNoTrack(size, alignment);
+        trace.onAlloc(ptr, size);
+        return ptr;
    }

    /// Free memory range.
@ -110,7 +112,8 @@ public:
        {
            checkSize(size);
            freeNoTrack(buf, size);
-            CurrentMemoryTracker::free(size);
+            auto trace = CurrentMemoryTracker::free(size);
+            trace.onFree(buf, size);
        }
        catch (...)
        {
@ -136,13 +139,17 @@ public:
                 && alignment <= MALLOC_MIN_ALIGNMENT)
        {
            /// Resize malloc'd memory region with no special alignment requirement.
-            CurrentMemoryTracker::realloc(old_size, new_size);
+            auto trace_free = CurrentMemoryTracker::free(old_size);
+            auto trace_alloc = CurrentMemoryTracker::alloc(new_size);
+            trace_free.onFree(buf, old_size);

            void * new_buf = ::realloc(buf, new_size);
            if (nullptr == new_buf)
                DB::throwFromErrno(fmt::format("Allocator: Cannot realloc from {} to {}.", ReadableSize(old_size), ReadableSize(new_size)), DB::ErrorCodes::CANNOT_ALLOCATE_MEMORY);

            buf = new_buf;
+            trace_alloc.onAlloc(buf, new_size);
+
            if constexpr (clear_memory)
                if (new_size > old_size)
                    memset(reinterpret_cast<char *>(buf) + old_size, 0, new_size - old_size);
@ -150,7 +157,9 @@ public:
        else if (old_size >= MMAP_THRESHOLD && new_size >= MMAP_THRESHOLD)
        {
            /// Resize mmap'd memory region.
-            CurrentMemoryTracker::realloc(old_size, new_size);
+            auto trace_free = CurrentMemoryTracker::free(old_size);
+            auto trace_alloc = CurrentMemoryTracker::alloc(new_size);
+            trace_free.onFree(buf, old_size);

            // On apple and freebsd self-implemented mremap used (common/mremap.h)
            buf = clickhouse_mremap(buf, old_size, new_size, MREMAP_MAYMOVE,
@ -160,13 +169,17 @@ public:
                    ReadableSize(old_size), ReadableSize(new_size)), DB::ErrorCodes::CANNOT_MREMAP);

            /// No need for zero-fill, because mmap guarantees it.
+            trace_alloc.onAlloc(buf, new_size);
        }
        else if (new_size < MMAP_THRESHOLD)
        {
            /// Small allocs that requires a copy. Assume there's enough memory in system. Call CurrentMemoryTracker once.
-            CurrentMemoryTracker::realloc(old_size, new_size);
+            auto trace_free = CurrentMemoryTracker::free(old_size);
+            auto trace_alloc = CurrentMemoryTracker::alloc(new_size);
+            trace_free.onFree(buf, old_size);

            void * new_buf = allocNoTrack(new_size, alignment);
+            /// Report the freshly allocated block: `buf` still points to the old region here.
+            trace_alloc.onAlloc(new_buf, new_size);
            memcpy(new_buf, buf, std::min(old_size, new_size));
            freeNoTrack(buf, old_size);
            buf = new_buf;
--- a/src/Common/AllocatorWithMemoryTracking.h
+++ b/src/Common/AllocatorWithMemoryTracking.h
@ -30,21 +30,24 @@ struct AllocatorWithMemoryTracking
            throw std::bad_alloc();

        size_t bytes = n * sizeof(T);
-        CurrentMemoryTracker::alloc(bytes);
+        auto trace = CurrentMemoryTracker::alloc(bytes);

        T * p = static_cast<T *>(malloc(bytes));
        if (!p)
            throw std::bad_alloc();

+        trace.onAlloc(p, bytes);
+
        return p;
    }

    void deallocate(T * p, size_t n) noexcept
    {
-        free(p);
-
        size_t bytes = n * sizeof(T);
-        CurrentMemoryTracker::free(bytes);
+
+        free(p);
+        auto trace = CurrentMemoryTracker::free(bytes);
+        trace.onFree(p, bytes);
    }
 };

--- a/src/Common/CurrentMemoryTracker.cpp
+++ b/src/Common/CurrentMemoryTracker.cpp
@ -37,7 +37,7 @@ MemoryTracker * getMemoryTracker()

 using DB::current_thread;

-void CurrentMemoryTracker::allocImpl(Int64 size, bool throw_if_memory_exceeded)
+AllocationTrace CurrentMemoryTracker::allocImpl(Int64 size, bool throw_if_memory_exceeded)
 {
 #ifdef MEMORY_TRACKER_DEBUG_CHECKS
    if (unlikely(memory_tracker_always_throw_logical_error_on_allocation))
@ -55,8 +55,9 @@ void CurrentMemoryTracker::allocImpl(Int64 size, bool throw_if_memory_exceeded)

            if (will_be > current_thread->untracked_memory_limit)
            {
-                memory_tracker->allocImpl(will_be, throw_if_memory_exceeded);
+                auto res = memory_tracker->allocImpl(will_be, throw_if_memory_exceeded);
                current_thread->untracked_memory = 0;
+                return res;
            }
            else
            {
@ -68,36 +69,34 @@ void CurrentMemoryTracker::allocImpl(Int64 size, bool throw_if_memory_exceeded)
        /// total_memory_tracker only, ignore untracked_memory
        else
        {
-            memory_tracker->allocImpl(size, throw_if_memory_exceeded);
+            return memory_tracker->allocImpl(size, throw_if_memory_exceeded);
        }
+
+        return AllocationTrace(memory_tracker->getSampleProbability(size));
    }
+
+    return AllocationTrace(0);
 }

 void CurrentMemoryTracker::check()
 {
    if (auto * memory_tracker = getMemoryTracker())
-        memory_tracker->allocImpl(0, true);
+        std::ignore = memory_tracker->allocImpl(0, true);
 }

-void CurrentMemoryTracker::alloc(Int64 size)
+AllocationTrace CurrentMemoryTracker::alloc(Int64 size)
 {
    bool throw_if_memory_exceeded = true;
-    allocImpl(size, throw_if_memory_exceeded);
+    return allocImpl(size, throw_if_memory_exceeded);
 }

-void CurrentMemoryTracker::allocNoThrow(Int64 size)
+AllocationTrace CurrentMemoryTracker::allocNoThrow(Int64 size)
 {
    bool throw_if_memory_exceeded = false;
-    allocImpl(size, throw_if_memory_exceeded);
+    return allocImpl(size, throw_if_memory_exceeded);
 }

-void CurrentMemoryTracker::realloc(Int64 old_size, Int64 new_size)
-{
-    Int64 addition = new_size - old_size;
-    addition > 0 ? alloc(addition) : free(-addition);
-}
-
-void CurrentMemoryTracker::free(Int64 size)
+AllocationTrace CurrentMemoryTracker::free(Int64 size)
 {
    if (auto * memory_tracker = getMemoryTracker())
    {
@ -106,16 +105,21 @@ void CurrentMemoryTracker::free(Int64 size)
            current_thread->untracked_memory -= size;
            if (current_thread->untracked_memory < -current_thread->untracked_memory_limit)
            {
-                memory_tracker->free(-current_thread->untracked_memory);
+                Int64 untracked_memory = current_thread->untracked_memory;
                current_thread->untracked_memory = 0;
+                return memory_tracker->free(-untracked_memory);
            }
        }
        /// total_memory_tracker only, ignore untracked_memory
        else
        {
-            memory_tracker->free(size);
+            return memory_tracker->free(size);
        }
+
+        return AllocationTrace(memory_tracker->getSampleProbability(size));
    }
+
+    return AllocationTrace(0);
 }

 void CurrentMemoryTracker::injectFault()
--- a/src/Common/CurrentMemoryTracker.h
+++ b/src/Common/CurrentMemoryTracker.h
@ -1,22 +1,22 @@
 #pragma once

 #include <base/types.h>
+#include <Common/AllocationTrace.h>

 /// Convenience methods, that use current thread's memory_tracker if it is available.
 struct CurrentMemoryTracker
 {
    /// Call the following functions before calling of corresponding operations with memory allocators.
-    static void alloc(Int64 size);
-    static void allocNoThrow(Int64 size);
-    static void realloc(Int64 old_size, Int64 new_size);
+    [[nodiscard]] static AllocationTrace alloc(Int64 size);
+    [[nodiscard]] static AllocationTrace allocNoThrow(Int64 size);

    /// This function should be called after memory deallocation.
-    static void free(Int64 size);
+    [[nodiscard]] static AllocationTrace free(Int64 size);
    static void check();

    /// Throws MEMORY_LIMIT_EXCEEDED (if it's allowed to throw exceptions)
    static void injectFault();

 private:
-    static void allocImpl(Int64 size, bool throw_if_memory_exceeded);
+    [[nodiscard]] static AllocationTrace allocImpl(Int64 size, bool throw_if_memory_exceeded);
 };
--- a/src/Common/Exception.h
+++ b/src/Common/Exception.h
@ -81,9 +81,9 @@ public:
    }

    /// Message must be a compile-time constant
-    template<typename T, typename = std::enable_if_t<std::is_convertible_v<T, String>>>
-    Exception(int code, T && message)
-        : Exception(message, code)
+    template <typename T>
+    requires std::is_convertible_v<T, String>
+    Exception(int code, T && message) : Exception(message, code)
    {
        capture_thread_frame_pointers = thread_frame_pointers;
        message_format_string = tryGetStaticFormatString(message);
--- a/src/Common/FiberStack.h
+++ b/src/Common/FiberStack.h
@ -57,7 +57,8 @@ public:
        }

        /// Do not count guard page in memory usage.
-        CurrentMemoryTracker::alloc(num_pages * page_size);
+        auto trace = CurrentMemoryTracker::alloc(num_pages * page_size);
+        trace.onAlloc(vp, num_pages * page_size);

        boost::context::stack_context sctx;
        sctx.size = num_bytes;
@ -77,6 +78,7 @@ public:
        ::munmap(vp, sctx.size);

        /// Do not count guard page in memory usage.
-        CurrentMemoryTracker::free(sctx.size - page_size);
+        auto trace = CurrentMemoryTracker::free(sctx.size - page_size);
+        trace.onFree(vp, sctx.size - page_size);
    }
 };
--- a/src/Common/IntervalTree.h
+++ b/src/Common/IntervalTree.h
@ -3,8 +3,8 @@
 #include <base/defines.h>
 #include <base/sort.h>

-#include <vector>
 #include <utility>
+#include <vector>


 namespace DB
@ -119,7 +119,8 @@ public:
        return true;
    }

-    template <typename TValue = Value, std::enable_if_t<!std::is_same_v<TValue, IntervalTreeVoidValue>, bool> = true, typename... Args>
+    template <typename TValue = Value, bool = true, typename... Args>
+    requires(!std::is_same_v<TValue, IntervalTreeVoidValue>)
    ALWAYS_INLINE bool emplace(Interval interval, Args &&... args)
    {
        assert(!tree_is_built);
--- a/src/Common/MemoryTracker.cpp
+++ b/src/Common/MemoryTracker.cpp
@ -1,6 +1,7 @@
 #include "MemoryTracker.h"

 #include <IO/WriteHelpers.h>
+#include <Common/HashTable/Hash.h>
 #include <Common/VariableContext.h>
 #include <Common/TraceSender.h>
 #include <Common/Exception.h>
@ -82,6 +83,29 @@ inline std::string_view toDescription(OvercommitResult result)
    }
 }

+/// Decides from the pointer value alone whether an allocation is sampled:
+/// the hash of the address is compared against `probability` scaled to the UInt64 range.
+/// The same `ptr` and `probability` therefore always give the same answer.
+bool shouldTrackAllocation(DB::Float64 probability, void * ptr)
+{
+    const auto threshold = std::numeric_limits<uint64_t>::max() * probability;
+    const auto address_hash = intHash64(uintptr_t(ptr));
+    return address_hash < threshold;
+}
+
+}
+
+/// Sends a MemorySample trace event for an allocation: positive `size`, plus the pointer.
+void AllocationTrace::onAllocImpl(void * ptr, size_t size) const
+{
+    /// With probability >= 1 every call is sent; below that, the decision depends on `ptr`
+    /// (see shouldTrackAllocation).
+    if (sample_probability < 1 && !shouldTrackAllocation(sample_probability, ptr))
+        return;
+
+    /// The blocker is held while the event is sent.
+    /// NOTE(review): presumably so that allocations made during sending are not tracked themselves — confirm.
+    MemoryTrackerBlockerInThread untrack_lock(VariableContext::Global);
+    DB::TraceSender::send(DB::TraceType::MemorySample, StackTrace(), {.size = Int64(size), .ptr = ptr});
+}
+
+/// Sends a MemorySample trace event for a deallocation: `size` is negated, plus the pointer.
+void AllocationTrace::onFreeImpl(void * ptr, size_t size) const
+{
+    /// Same pointer-based decision as in onAllocImpl, so for a given probability
+    /// the allocation and the deallocation of one pointer get the same answer.
+    if (sample_probability < 1 && !shouldTrackAllocation(sample_probability, ptr))
+        return;
+
+    /// The blocker is held while the event is sent.
+    /// NOTE(review): presumably so that allocations made during sending are not tracked themselves — confirm.
+    MemoryTrackerBlockerInThread untrack_lock(VariableContext::Global);
+    DB::TraceSender::send(DB::TraceType::MemorySample, StackTrace(), {.size = -Int64(size), .ptr = ptr});
 }

 namespace ProfileEvents
@ -180,11 +204,17 @@ void MemoryTracker::debugLogBigAllocationWithoutCheck(Int64 size [[maybe_unused]
 #endif
 }

-void MemoryTracker::allocImpl(Int64 size, bool throw_if_memory_exceeded, MemoryTracker * query_tracker)
+AllocationTrace MemoryTracker::allocImpl(Int64 size, bool throw_if_memory_exceeded, MemoryTracker * query_tracker, double _sample_probability)
 {
    if (size < 0)
        throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Negative size ({}) is passed to MemoryTracker. It is a bug.", size);

+    if (_sample_probability < 0)
+        _sample_probability = sample_probability;
+
+    if (!isSizeOkForSampling(size))
+        _sample_probability = 0;
+
    if (MemoryTrackerBlockerInThread::isBlocked(level))
    {
        if (level == VariableContext::Global)
@ -199,9 +229,12 @@ void MemoryTracker::allocImpl(Int64 size, bool throw_if_memory_exceeded, MemoryT

        /// Since the MemoryTrackerBlockerInThread should respect the level, we should go to the next parent.
        if (auto * loaded_next = parent.load(std::memory_order_relaxed))
-            loaded_next->allocImpl(size, throw_if_memory_exceeded,
-                level == VariableContext::Process ? this : query_tracker);
-        return;
+        {
+            MemoryTracker * tracker = level == VariableContext::Process ? this : query_tracker;
+            return loaded_next->allocImpl(size, throw_if_memory_exceeded, tracker, _sample_probability);
+        }
+
+        return AllocationTrace(_sample_probability);
    }

    /** Using memory_order_relaxed means that if allocations are done simultaneously,
@ -228,14 +261,6 @@ void MemoryTracker::allocImpl(Int64 size, bool throw_if_memory_exceeded, MemoryT
        allocation_traced = true;
    }

-    std::bernoulli_distribution sample(sample_probability);
-    if (unlikely(sample_probability > 0.0 && isSizeOkForSampling(size) && sample(thread_local_rng)))
-    {
-        MemoryTrackerBlockerInThread untrack_lock(VariableContext::Global);
-        DB::TraceSender::send(DB::TraceType::MemorySample, StackTrace(), {.size = size});
-        allocation_traced = true;
-    }
-
    std::bernoulli_distribution fault(fault_probability);
    if (unlikely(fault_probability > 0.0 && fault(thread_local_rng)))
    {
@ -364,16 +389,20 @@ void MemoryTracker::allocImpl(Int64 size, bool throw_if_memory_exceeded, MemoryT
    }

    if (auto * loaded_next = parent.load(std::memory_order_relaxed))
-        loaded_next->allocImpl(size, throw_if_memory_exceeded,
-            level == VariableContext::Process ? this : query_tracker);
+    {
+        MemoryTracker * tracker = level == VariableContext::Process ? this : query_tracker;
+        return loaded_next->allocImpl(size, throw_if_memory_exceeded, tracker, _sample_probability);
+    }
+
+    return AllocationTrace(_sample_probability);
 }

 void MemoryTracker::adjustWithUntrackedMemory(Int64 untracked_memory)
 {
    if (untracked_memory > 0)
-        allocImpl(untracked_memory, /*throw_if_memory_exceeded*/ false);
+        std::ignore = allocImpl(untracked_memory, /*throw_if_memory_exceeded*/ false);
    else
-        free(-untracked_memory);
+        std::ignore = free(-untracked_memory);
 }

 bool MemoryTracker::updatePeak(Int64 will_be, bool log_memory_usage)
@ -392,9 +421,14 @@ bool MemoryTracker::updatePeak(Int64 will_be, bool log_memory_usage)
    return false;
 }

-
-void MemoryTracker::free(Int64 size)
+AllocationTrace MemoryTracker::free(Int64 size, double _sample_probability)
 {
+    if (_sample_probability < 0)
+        _sample_probability = sample_probability;
+
+    if (!isSizeOkForSampling(size))
+        _sample_probability = 0;
+
    if (MemoryTrackerBlockerInThread::isBlocked(level))
    {
        if (level == VariableContext::Global)
@ -408,15 +442,9 @@ void MemoryTracker::free(Int64 size)

        /// Since the MemoryTrackerBlockerInThread should respect the level, we should go to the next parent.
        if (auto * loaded_next = parent.load(std::memory_order_relaxed))
-            loaded_next->free(size);
-        return;
-    }
+            return loaded_next->free(size, _sample_probability);

-    std::bernoulli_distribution sample(sample_probability);
-    if (unlikely(sample_probability > 0.0 && isSizeOkForSampling(size) && sample(thread_local_rng)))
-    {
-        MemoryTrackerBlockerInThread untrack_lock(VariableContext::Global);
-        DB::TraceSender::send(DB::TraceType::MemorySample, StackTrace(), {.size = -size});
+        return AllocationTrace(_sample_probability);
    }

    Int64 accounted_size = size;
@ -444,12 +472,15 @@ void MemoryTracker::free(Int64 size)
    if (auto * overcommit_tracker_ptr = overcommit_tracker.load(std::memory_order_relaxed))
        overcommit_tracker_ptr->tryContinueQueryExecutionAfterFree(accounted_size);

-    if (auto * loaded_next = parent.load(std::memory_order_relaxed))
-        loaded_next->free(size);
-
+    /// free should never throw, we can update metric early.
    auto metric_loaded = metric.load(std::memory_order_relaxed);
    if (metric_loaded != CurrentMetrics::end())
        CurrentMetrics::sub(metric_loaded, accounted_size);
+
+    if (auto * loaded_next = parent.load(std::memory_order_relaxed))
+        return loaded_next->free(size, _sample_probability);
+
+    return AllocationTrace(_sample_probability);
 }


@ -534,6 +565,21 @@ void MemoryTracker::setOrRaiseProfilerLimit(Int64 value)
        ;
 }

+/// Returns the sampling probability that applies to an allocation of `size` bytes.
+/// A tracker with a non-negative sample_probability answers itself (0 if the size is not ok for sampling);
+/// otherwise the question is passed to the parent tracker, and the result is 0 if there is no parent.
+double MemoryTracker::getSampleProbability(UInt64 size)
+{
+    if (sample_probability >= 0)
+        return isSizeOkForSampling(size) ? sample_probability : 0;
+
+    auto * next_tracker = parent.load(std::memory_order_relaxed);
+    return next_tracker ? next_tracker->getSampleProbability(size) : 0;
+}
+
 bool MemoryTracker::isSizeOkForSampling(UInt64 size) const
 {
    /// We can avoid comparison min_allocation_size_bytes with zero, because we cannot have 0 bytes allocation/deallocation
--- a/src/Common/MemoryTracker.h
+++ b/src/Common/MemoryTracker.h
@ -2,9 +2,11 @@

 #include <atomic>
 #include <chrono>
+#include <optional>
 #include <base/types.h>
 #include <Common/CurrentMetrics.h>
 #include <Common/VariableContext.h>
+#include <Common/AllocationTrace.h>

 #if !defined(NDEBUG)
 #define MEMORY_TRACKER_DEBUG_CHECKS
@ -65,7 +67,7 @@ private:
    double fault_probability = 0;

    /// To randomly sample allocations and deallocations in trace_log.
-    double sample_probability = 0;
+    double sample_probability = -1;

    /// Randomly sample allocations only larger or equal to this size
    UInt64 min_allocation_size_bytes = 0;
@ -98,8 +100,8 @@ private:

    /// allocImpl(...) and free(...) should not be used directly
    friend struct CurrentMemoryTracker;
-    void allocImpl(Int64 size, bool throw_if_memory_exceeded, MemoryTracker * query_tracker = nullptr);
-    void free(Int64 size);
+    [[nodiscard]] AllocationTrace allocImpl(Int64 size, bool throw_if_memory_exceeded, MemoryTracker * query_tracker = nullptr, double _sample_probability = -1.0);
+    [[nodiscard]] AllocationTrace free(Int64 size, double _sample_probability = -1.0);
 public:

    static constexpr auto USAGE_EVENT_NAME = "MemoryTrackerUsage";
@ -174,6 +176,8 @@ public:
        sample_probability = value;
    }

+    double getSampleProbability(UInt64 size);
+
    void setSampleMinAllocationSize(UInt64 value)
    {
        min_allocation_size_bytes = value;
--- a/src/Common/MemoryTrackerBlockerInThread.h
+++ b/src/Common/MemoryTrackerBlockerInThread.h
@ -28,4 +28,5 @@ public:
    }

    friend class MemoryTracker;
+    friend struct AllocationTrace;
 };
--- a/src/Common/NetException.h
+++ b/src/Common/NetException.h
@ -9,7 +9,8 @@ namespace DB
 class NetException : public Exception
 {
 public:
-    template<typename T, typename = std::enable_if_t<std::is_convertible_v<T, String>>>
+    template <typename T>
+    requires std::is_convertible_v<T, String>
    NetException(int code, T && message) : Exception(std::forward<T>(message), code)
    {
        message_format_string = tryGetStaticFormatString(message);
--- a/src/Common/TraceSender.cpp
+++ b/src/Common/TraceSender.cpp
@ -33,6 +33,7 @@ void TraceSender::send(TraceType trace_type, const StackTrace & stack_trace, Ext
        + sizeof(TraceType)                  /// trace type
        + sizeof(UInt64)                     /// thread_id
        + sizeof(Int64)                      /// size
+        + sizeof(void *)                     /// ptr
        + sizeof(ProfileEvents::Event)       /// event
        + sizeof(ProfileEvents::Count);      /// increment

@ -74,6 +75,7 @@ void TraceSender::send(TraceType trace_type, const StackTrace & stack_trace, Ext
    writePODBinary(trace_type, out);
    writePODBinary(thread_id, out);
    writePODBinary(extras.size, out);
+    writePODBinary(UInt64(extras.ptr), out);
    writePODBinary(extras.event, out);
    writePODBinary(extras.increment, out);

--- a/src/Common/TraceSender.h
+++ b/src/Common/TraceSender.h
@ -28,8 +28,9 @@ class TraceSender
 public:
    struct Extras
    {
-        /// size - for memory tracing is the amount of memory allocated; for other trace types it is 0.
+        /// size, ptr - for memory tracing, the amount of memory allocated and its address; for other trace types they are 0 and nullptr.
        Int64 size{};
+        void * ptr = nullptr;
        /// Event type and increment for 'ProfileEvent' trace type; for other trace types defaults.
        ProfileEvents::Event event{ProfileEvents::end()};
        ProfileEvents::Count increment{};
--- a/src/Common/clickhouse_malloc.cpp
+++ b/src/Common/clickhouse_malloc.cpp
@ -9,7 +9,11 @@ extern "C" void * clickhouse_malloc(size_t size)
 {
    void * res = malloc(size);
    if (res)
-        Memory::trackMemory(size);
+    {
+        AllocationTrace trace;
+        size_t actual_size = Memory::trackMemory(size, trace);
+        trace.onAlloc(res, actual_size);
+    }
    return res;
 }

@ -17,17 +21,29 @@ extern "C" void * clickhouse_calloc(size_t number_of_members, size_t size)
 {
    void * res = calloc(number_of_members, size);
    if (res)
-        Memory::trackMemory(number_of_members * size);
+    {
+        AllocationTrace trace;
+        size_t actual_size = Memory::trackMemory(number_of_members * size, trace);
+        trace.onAlloc(res, actual_size);
+    }
    return res;
 }

 extern "C" void * clickhouse_realloc(void * ptr, size_t size)
 {
    if (ptr)
-        Memory::untrackMemory(ptr);
+    {
+        AllocationTrace trace;
+        size_t actual_size = Memory::untrackMemory(ptr, trace);
+        trace.onFree(ptr, actual_size);
+    }
    void * res = realloc(ptr, size);
    if (res)
-        Memory::trackMemory(size);
+    {
+        AllocationTrace trace;
+        size_t actual_size = Memory::trackMemory(size, trace);
+        trace.onAlloc(res, actual_size);
+    }
    return res;
 }

@ -42,7 +58,9 @@ extern "C" void * clickhouse_reallocarray(void * ptr, size_t number_of_members,

 extern "C" void clickhouse_free(void * ptr)
 {
-    Memory::untrackMemory(ptr);
+    AllocationTrace trace;
+    size_t actual_size = Memory::untrackMemory(ptr, trace);
+    trace.onFree(ptr, actual_size);
    free(ptr);
 }

@ -50,6 +68,10 @@ extern "C" int clickhouse_posix_memalign(void ** memptr, size_t alignment, size_
 {
    int res = posix_memalign(memptr, alignment, size);
    if (res == 0)
-        Memory::trackMemory(size);
+    {
+        AllocationTrace trace;
+        size_t actual_size = Memory::trackMemory(size, trace);
+        trace.onAlloc(*memptr, actual_size);
+    }
    return res;
 }
--- a/src/Common/config.h.in
+++ b/src/Common/config.h.in
@ -59,6 +59,7 @@
 #cmakedefine01 USE_ULID
 #cmakedefine01 FIU_ENABLE
 #cmakedefine01 USE_BCRYPT
+#cmakedefine01 USE_LIBARCHIVE

 /// This is needed for .incbin in assembly. For some reason, include paths don't work there in presence of LTO.
 /// That's why we use absolute paths.
--- a/src/Common/memory.h
+++ b/src/Common/memory.h
@ -169,23 +169,26 @@ inline ALWAYS_INLINE size_t getActualAllocationSize(size_t size, TAlign... align

 template <std::same_as<std::align_val_t>... TAlign>
 requires DB::OptionalArgument<TAlign...>
-inline ALWAYS_INLINE void trackMemory(std::size_t size, TAlign... align)
+inline ALWAYS_INLINE size_t trackMemory(std::size_t size, AllocationTrace & trace, TAlign... align)
 {
    std::size_t actual_size = getActualAllocationSize(size, align...);
-    CurrentMemoryTracker::allocNoThrow(actual_size);
+    trace = CurrentMemoryTracker::allocNoThrow(actual_size);
+    return actual_size;
 }

 template <std::same_as<std::align_val_t>... TAlign>
 requires DB::OptionalArgument<TAlign...>
-inline ALWAYS_INLINE void untrackMemory(void * ptr [[maybe_unused]], std::size_t size [[maybe_unused]] = 0, TAlign... align [[maybe_unused]]) noexcept
+inline ALWAYS_INLINE size_t untrackMemory(void * ptr [[maybe_unused]], AllocationTrace & trace, std::size_t size [[maybe_unused]] = 0, TAlign... align [[maybe_unused]]) noexcept
 {
+    std::size_t actual_size = 0;
+
 #if USE_GWP_ASAN
    if (unlikely(GuardedAlloc.pointerIsMine(ptr)))
    {
        if (!size)
            size = GuardedAlloc.getSize(ptr);
-        CurrentMemoryTracker::free(size);
-        return;
+        trace = CurrentMemoryTracker::free(size);
+        return size;
    }
 #endif

@ -197,23 +200,26 @@ inline ALWAYS_INLINE void untrackMemory(void * ptr [[maybe_unused]], std::size_t
        if (likely(ptr != nullptr))
        {
            if constexpr (sizeof...(TAlign) == 1)
-                CurrentMemoryTracker::free(sallocx(ptr, MALLOCX_ALIGN(alignToSizeT(align...))));
+                actual_size = sallocx(ptr, MALLOCX_ALIGN(alignToSizeT(align...)));
            else
-                CurrentMemoryTracker::free(sallocx(ptr, 0));
+                actual_size = sallocx(ptr, 0);
        }
 #else
        if (size)
-            CurrentMemoryTracker::free(size);
+            actual_size = size;
 #    if defined(_GNU_SOURCE)
        /// It's innaccurate resource free for sanitizers. malloc_usable_size() result is greater or equal to allocated size.
        else
-            CurrentMemoryTracker::free(malloc_usable_size(ptr));
+            actual_size = malloc_usable_size(ptr);
 #    endif
 #endif
+        trace = CurrentMemoryTracker::free(actual_size);
    }
    catch (...)
    {
    }
+
+    return actual_size;
 }

 }
--- a/src/Common/new_delete.cpp
+++ b/src/Common/new_delete.cpp
@ -71,50 +71,74 @@ static struct InitGwpAsan

 void * operator new(std::size_t size)
 {
-    Memory::trackMemory(size);
-    return Memory::newImpl(size);
+    AllocationTrace trace;
+    std::size_t actual_size = Memory::trackMemory(size, trace);
+    void * ptr = Memory::newImpl(size);
+    trace.onAlloc(ptr, actual_size);
+    return ptr;
 }

 void * operator new(std::size_t size, std::align_val_t align)
 {
-    Memory::trackMemory(size, align);
-    return Memory::newImpl(size, align);
+    AllocationTrace trace;
+    std::size_t actual_size = Memory::trackMemory(size, trace, align);
+    void * ptr = Memory::newImpl(size, align);
+    trace.onAlloc(ptr, actual_size);
+    return ptr;
 }

 void * operator new[](std::size_t size)
 {
-    Memory::trackMemory(size);
-    return Memory::newImpl(size);
+    AllocationTrace trace;
+    std::size_t actual_size = Memory::trackMemory(size, trace);
+    void * ptr =  Memory::newImpl(size);
+    trace.onAlloc(ptr, actual_size);
+    return ptr;
 }

 void * operator new[](std::size_t size, std::align_val_t align)
 {
-    Memory::trackMemory(size, align);
-    return Memory::newImpl(size, align);
+    AllocationTrace trace;
+    std::size_t actual_size = Memory::trackMemory(size, trace, align);
+    void * ptr = Memory::newImpl(size, align);
+    trace.onAlloc(ptr, actual_size);
+    return ptr;
 }

 void * operator new(std::size_t size, const std::nothrow_t &) noexcept
 {
-    Memory::trackMemory(size);
-    return Memory::newNoExept(size);
+    AllocationTrace trace;
+    std::size_t actual_size = Memory::trackMemory(size, trace);
+    void * ptr = Memory::newNoExept(size);
+    trace.onAlloc(ptr, actual_size);
+    return ptr;
 }

 void * operator new[](std::size_t size, const std::nothrow_t &) noexcept
 {
-    Memory::trackMemory(size);
-    return Memory::newNoExept(size);
+    AllocationTrace trace;
+    std::size_t actual_size = Memory::trackMemory(size, trace);
+    void * ptr = Memory::newNoExept(size);
+    trace.onAlloc(ptr, actual_size);
+    return ptr;
 }

 void * operator new(std::size_t size, std::align_val_t align, const std::nothrow_t &) noexcept
 {
-    Memory::trackMemory(size, align);
-    return Memory::newNoExept(size, align);
+    AllocationTrace trace;
+    std::size_t actual_size = Memory::trackMemory(size, trace, align);
+    void * ptr = Memory::newNoExept(size, align);
+    trace.onAlloc(ptr, actual_size);
+    return ptr;
 }

 void * operator new[](std::size_t size, std::align_val_t align, const std::nothrow_t &) noexcept
 {
-    Memory::trackMemory(size, align);
-    return Memory::newNoExept(size, align);
+    AllocationTrace trace;
+    std::size_t actual_size = Memory::trackMemory(size, trace, align);
+    void * ptr = Memory::newNoExept(size, align);
+    trace.onAlloc(ptr, actual_size);
+    return ptr;
 }

 /// delete
@ -130,48 +154,64 @@ void * operator new[](std::size_t size, std::align_val_t align, const std::nothr

 void operator delete(void * ptr) noexcept
 {
-    Memory::untrackMemory(ptr);
+    AllocationTrace trace;
+    std::size_t actual_size = Memory::untrackMemory(ptr, trace);
+    trace.onFree(ptr, actual_size);
    Memory::deleteImpl(ptr);
 }

 void operator delete(void * ptr, std::align_val_t align) noexcept
 {
-    Memory::untrackMemory(ptr, 0, align);
+    AllocationTrace trace;
+    std::size_t actual_size = Memory::untrackMemory(ptr, trace, 0, align);
+    trace.onFree(ptr, actual_size);
    Memory::deleteImpl(ptr);
 }

 void operator delete[](void * ptr) noexcept
 {
-    Memory::untrackMemory(ptr);
+    AllocationTrace trace;
+    std::size_t actual_size = Memory::untrackMemory(ptr, trace);
+    trace.onFree(ptr, actual_size);
    Memory::deleteImpl(ptr);
 }

 void operator delete[](void * ptr, std::align_val_t align) noexcept
 {
-    Memory::untrackMemory(ptr, 0, align);
+    AllocationTrace trace;
+    std::size_t actual_size = Memory::untrackMemory(ptr, trace, 0, align);
+    trace.onFree(ptr, actual_size);
    Memory::deleteImpl(ptr);
 }

 void operator delete(void * ptr, std::size_t size) noexcept
 {
-    Memory::untrackMemory(ptr, size);
+    AllocationTrace trace;
+    std::size_t actual_size = Memory::untrackMemory(ptr, trace, size);
+    trace.onFree(ptr, actual_size);
    Memory::deleteSized(ptr, size);
 }

 void operator delete(void * ptr, std::size_t size, std::align_val_t align) noexcept
 {
-    Memory::untrackMemory(ptr, size, align);
+    AllocationTrace trace;
+    std::size_t actual_size = Memory::untrackMemory(ptr, trace, size, align);
+    trace.onFree(ptr, actual_size);
    Memory::deleteSized(ptr, size, align);
 }

 void operator delete[](void * ptr, std::size_t size) noexcept
 {
-    Memory::untrackMemory(ptr, size);
+    AllocationTrace trace;
+    std::size_t actual_size = Memory::untrackMemory(ptr, trace, size);
+    trace.onFree(ptr, actual_size);
    Memory::deleteSized(ptr, size);
 }

 void operator delete[](void * ptr, std::size_t size, std::align_val_t align) noexcept
 {
-    Memory::untrackMemory(ptr, size, align);
+    AllocationTrace trace;
+    std::size_t actual_size = Memory::untrackMemory(ptr, trace, size, align);
+    trace.onFree(ptr, actual_size);
    Memory::deleteSized(ptr, size, align);
 }
--- a/src/Common/setThreadName.cpp
+++ b/src/Common/setThreadName.cpp
@ -43,7 +43,7 @@ void setThreadName(const char * name)
 #else
    if (0 != prctl(PR_SET_NAME, name, 0, 0, 0))
 #endif
-        if (errno != ENOSYS)    /// It's ok if the syscall is unsupported in some environments.
+        if (errno != ENOSYS && errno != EPERM)    /// It's ok if the syscall is unsupported or not allowed in some environments.
            DB::throwFromErrno("Cannot set thread name with prctl(PR_SET_NAME, ...)", DB::ErrorCodes::PTHREAD_ERROR);

    memcpy(thread_name, name, std::min<size_t>(1 + strlen(name), THREAD_NAME_SIZE - 1));
@ -63,7 +63,7 @@ const char * getThreadName()
 //        throw DB::Exception(DB::ErrorCodes::PTHREAD_ERROR, "Cannot get thread name with pthread_get_name_np()");
 #else
    if (0 != prctl(PR_GET_NAME, thread_name, 0, 0, 0))
-        if (errno != ENOSYS)    /// It's ok if the syscall is unsupported in some environments.
+        if (errno != ENOSYS && errno != EPERM)    /// It's ok if the syscall is unsupported or not allowed in some environments.
            DB::throwFromErrno("Cannot get thread name with prctl(PR_GET_NAME)", DB::ErrorCodes::PTHREAD_ERROR);
 #endif

--- a/src/Core/MultiEnum.h
+++ b/src/Core/MultiEnum.h
@ -12,9 +12,9 @@ struct MultiEnum

    MultiEnum() = default;

-    template <typename ... EnumValues, typename = std::enable_if_t<std::conjunction_v<std::is_same<EnumTypeT, EnumValues>...>>>
-    constexpr explicit MultiEnum(EnumValues ... v)
-        : MultiEnum((toBitFlag(v) | ... | 0u))
+    template <typename... EnumValues>
+    requires std::conjunction_v<std::is_same<EnumTypeT, EnumValues>...>
+    constexpr explicit MultiEnum(EnumValues... v) : MultiEnum((toBitFlag(v) | ... | 0u))
    {}

    template <typename ValueType>
--- a/src/Core/Settings.h
+++ b/src/Core/Settings.h
@ -500,6 +500,7 @@ class IColumn;
    M(Bool, formatdatetime_parsedatetime_m_is_month_name, true, "Formatter '%M' in functions 'formatDateTime()' and 'parseDateTime()' produces the month name instead of minutes.", 0) \
    \
    M(UInt64, max_partitions_per_insert_block, 100, "Limit maximum number of partitions in single INSERTed block. Zero means unlimited. Throw exception if the block contains too many partitions. This setting is a safety threshold, because using large number of partitions is a common misconception.", 0) \
+    M(Bool, throw_on_max_partitions_per_insert_block, true, "Used with max_partitions_per_insert_block. If true (default), an exception will be thrown when max_partitions_per_insert_block is reached. If false, details of the insert query reaching this limit with the number of partitions will be logged. This can be useful if you're trying to understand the impact on users when changing max_partitions_per_insert_block.", 0) \
    M(Int64, max_partitions_to_read, -1, "Limit the max number of partitions that can be accessed in one query. <= 0 means unlimited.", 0) \
    M(Bool, check_query_single_value_result, true, "Return check query result as single 1/0 value", 0) \
    M(Bool, allow_drop_detached, false, "Allow ALTER TABLE ... DROP DETACHED PART[ITION] ... queries", 0) \
@ -961,7 +962,7 @@ class IColumn;
    M(ParquetVersion, output_format_parquet_version, "2.latest", "Parquet format version for output format. Supported versions: 1.0, 2.4, 2.6 and 2.latest (default)", 0) \
    M(ParquetCompression, output_format_parquet_compression_method, "lz4", "Compression method for Parquet output format. Supported codecs: snappy, lz4, brotli, zstd, gzip, none (uncompressed)", 0) \
    M(Bool, output_format_parquet_compliant_nested_types, true, "In parquet file schema, use name 'element' instead of 'item' for list elements. This is a historical artifact of Arrow library implementation. Generally increases compatibility, except perhaps with some old versions of Arrow.", 0) \
-    M(Bool, output_format_parquet_use_custom_encoder, true, "Use experimental faster Parquet encoder implementation.", 0) \
+    M(Bool, output_format_parquet_use_custom_encoder, false, "Use a faster Parquet encoder implementation.", 0) \
    M(Bool, output_format_parquet_parallel_encoding, true, "Do Parquet encoding in multiple threads. Requires output_format_parquet_use_custom_encoder.", 0) \
    M(UInt64, output_format_parquet_data_page_size, 1024 * 1024, "Target page size in bytes, before compression.", 0) \
    M(UInt64, output_format_parquet_batch_size, 1024, "Check page size every this many rows. Consider decreasing if you have columns with average values size above a few KBs.", 0) \
--- a/src/Core/SettingsEnums.cpp
+++ b/src/Core/SettingsEnums.cpp
@ -138,7 +138,6 @@ IMPLEMENT_SETTING_ENUM(MsgPackUUIDRepresentation, ErrorCodes::BAD_ARGUMENTS,

 IMPLEMENT_SETTING_ENUM(Dialect, ErrorCodes::BAD_ARGUMENTS,
    {{"clickhouse", Dialect::clickhouse},
-     {"kusto", Dialect::kusto},
     {"kusto", Dialect::kusto},
     {"prql", Dialect::prql}})
    // FIXME: do not add 'kusto_auto' to the list. Maybe remove it from code completely?
--- a/src/DataTypes/DataTypeArray.cpp
+++ b/src/DataTypes/DataTypeArray.cpp
@ -11,6 +11,7 @@
 #include <Common/assert_cast.h>

 #include <Core/NamesAndTypes.h>
+#include <Columns/ColumnConst.h>


 namespace DB
@ -20,6 +21,7 @@ namespace ErrorCodes
 {
    extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
 }
+using FieldType = Array;


 DataTypeArray::DataTypeArray(const DataTypePtr & nested_)
@ -33,7 +35,6 @@ MutableColumnPtr DataTypeArray::createColumn() const
    return ColumnArray::create(nested->createColumn(), ColumnArray::ColumnOffsets::create());
 }

-
 Field DataTypeArray::getDefault() const
 {
    return Array();
--- a/src/DataTypes/DataTypeArray.h
+++ b/src/DataTypes/DataTypeArray.h
@ -2,6 +2,7 @@

 #include <DataTypes/IDataType.h>
 #include <DataTypes/Serializations/SerializationArray.h>
+#include <Columns/ColumnArray.h>


 namespace DB
@ -15,6 +16,8 @@ private:
    DataTypePtr nested;

 public:
+    using FieldType = Array;
+    using ColumnType = ColumnArray;
    static constexpr bool is_parametric = true;

    explicit DataTypeArray(const DataTypePtr & nested_);
@ -42,6 +45,7 @@ public:

    MutableColumnPtr createColumn() const override;

+
    Field getDefault() const override;

    bool equals(const IDataType & rhs) const override;
--- a/src/DataTypes/Serializations/SerializationNullable.cpp
+++ b/src/DataTypes/Serializations/SerializationNullable.cpp
@ -189,10 +189,10 @@ void SerializationNullable::serializeBinary(const IColumn & column, size_t row_n

 /// Deserialize value into ColumnNullable.
 /// We need to insert both to nested column and to null byte map, or, in case of exception, to not insert at all.
-template <typename ReturnType = void, typename CheckForNull, typename DeserializeNested, typename std::enable_if_t<std::is_same_v<ReturnType, void>, ReturnType>* = nullptr>
-static ReturnType safeDeserialize(
-    IColumn & column, const ISerialization &,
-    CheckForNull && check_for_null, DeserializeNested && deserialize_nested)
+template <typename ReturnType = void, typename CheckForNull, typename DeserializeNested, ReturnType * = nullptr>
+requires std::same_as<ReturnType, void>
+static ReturnType
+safeDeserialize(IColumn & column, const ISerialization &, CheckForNull && check_for_null, DeserializeNested && deserialize_nested)
 {
    ColumnNullable & col = assert_cast<ColumnNullable &>(column);

@ -217,10 +217,10 @@ static ReturnType safeDeserialize(
 }

 /// Deserialize value into non-nullable column. In case of NULL, insert default value and return false.
-template <typename ReturnType = void, typename CheckForNull, typename DeserializeNested, typename std::enable_if_t<std::is_same_v<ReturnType, bool>, ReturnType>* = nullptr>
-static ReturnType safeDeserialize(
-        IColumn & column, const ISerialization &,
-        CheckForNull && check_for_null, DeserializeNested && deserialize_nested)
+template <typename ReturnType = void, typename CheckForNull, typename DeserializeNested, ReturnType * = nullptr>
+requires std::same_as<ReturnType, bool>
+static ReturnType
+safeDeserialize(IColumn & column, const ISerialization &, CheckForNull && check_for_null, DeserializeNested && deserialize_nested)
 {
    bool insert_default = check_for_null();
    if (insert_default)
--- a/src/Dictionaries/RegExpTreeDictionary.cpp
+++ b/src/Dictionaries/RegExpTreeDictionary.cpp
@ -346,7 +346,7 @@ void RegExpTreeDictionary::loadData()
            ids[i] = static_cast<unsigned>(i+1);

        hs_error_t err = hs_compile_lit_multi(patterns.data(), flags.data(), ids.get(), lengths.data(), static_cast<unsigned>(patterns.size()), HS_MODE_BLOCK, nullptr, &db, &compile_error);
-        origin_db = (db);
+        origin_db.reset(db);
        if (err != HS_SUCCESS)
        {
            /// CompilerError is a unique_ptr, so correct memory free after the exception is thrown.
@ -658,7 +658,7 @@ std::unordered_map<String, ColumnPtr> RegExpTreeDictionary::match(
            };

            hs_error_t err = hs_scan(
-                origin_db,
+                origin_db.get(),
                reinterpret_cast<const char *>(keys_data.data()) + offset,
                static_cast<unsigned>(length),
                0,
--- a/src/Dictionaries/RegExpTreeDictionary.h
+++ b/src/Dictionaries/RegExpTreeDictionary.h
@ -199,7 +199,7 @@ private:
    #if USE_VECTORSCAN
    MultiRegexps::DeferredConstructedRegexpsPtr hyperscan_regex;
    MultiRegexps::ScratchPtr origin_scratch;
-    hs_database_t* origin_db;
+    MultiRegexps::DataBasePtr origin_db;
    #endif

    Poco::Logger * logger;
--- a/src/Disks/TemporaryFileOnDisk.cpp
+++ b/src/Disks/TemporaryFileOnDisk.cpp
@ -1,5 +1,4 @@
 #include <Disks/TemporaryFileOnDisk.h>
-#include <Poco/TemporaryFile.h>
 #include <Common/CurrentMetrics.h>
 #include <Common/logger_useful.h>

@ -41,17 +40,9 @@ TemporaryFileOnDisk::TemporaryFileOnDisk(const DiskPtr & disk_, const String & p

    ProfileEvents::increment(ProfileEvents::ExternalProcessingFilesTotal);

-    /// Do not use default temporaty root path `/tmp/tmpXXXXXX`.
-    /// The `dummy_prefix` is used to know what to replace with the real prefix.
-    String dummy_prefix = "a/";
-    relative_path = Poco::TemporaryFile::tempName(dummy_prefix);
-    dummy_prefix += "tmp";
-    /// a/tmpXXXXX -> <prefix>XXXXX
-    assert(relative_path.starts_with(dummy_prefix));
-    relative_path.replace(0, dummy_prefix.length(), prefix);
-
-    if (relative_path.empty())
-        throw Exception(ErrorCodes::LOGICAL_ERROR, "Temporary file name is empty");
+    /// A disk can be remote and shared between multiple replicas.
+    /// That's why we must not use Poco::TemporaryFile::tempName() here (Poco::TemporaryFile::tempName() can return the same names for different processes on different nodes).
+    relative_path = prefix + toString(UUIDHelpers::generateV4());
 }

 String TemporaryFileOnDisk::getAbsolutePath() const
--- a/src/Functions/FunctionBinaryArithmetic.h
+++ b/src/Functions/FunctionBinaryArithmetic.h
@ -42,6 +42,15 @@
 #include <Common/assert_cast.h>
 #include <Common/typeid_cast.h>
 #include <Common/Arena.h>
+#include <Core/ColumnWithTypeAndName.h>
+#include <base/types.h>
+#include <Columns/ColumnArray.h>
+#include <Columns/IColumn.h>
+#include <Core/ColumnsWithTypeAndName.h>
+#include <DataTypes/IDataType.h>
+#include <DataTypes/getMostSubtype.h>
+#include <base/TypeLists.h>
+#include <DataTypes/DataTypeArray.h>
 #include <DataTypes/DataTypeLowCardinality.h>
 #include <Interpreters/Context.h>

@ -62,6 +71,7 @@ namespace ErrorCodes
    extern const int DECIMAL_OVERFLOW;
    extern const int CANNOT_ADD_DIFFERENT_AGGREGATE_STATES;
    extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
+    extern const int SIZES_OF_ARRAYS_DONT_MATCH;
 }

 namespace traits_
@ -102,6 +112,9 @@ template <typename DataType> constexpr bool IsFloatingPoint = false;
 template <> inline constexpr bool IsFloatingPoint<DataTypeFloat32> = true;
 template <> inline constexpr bool IsFloatingPoint<DataTypeFloat64> = true;

+template <typename DataType> constexpr bool IsArray = false;
+template <> inline constexpr bool IsArray<DataTypeArray> = true;
+
 template <typename DataType> constexpr bool IsDateOrDateTime = false;
 template <> inline constexpr bool IsDateOrDateTime<DataTypeDate> = true;
 template <> inline constexpr bool IsDateOrDateTime<DataTypeDateTime> = true;
@ -1125,6 +1138,73 @@ class FunctionBinaryArithmetic : public IFunction
        return function->execute(arguments, result_type, input_rows_count);
    }

+    /// Element-wise arithmetic on two Array arguments. Constant arguments are unwrapped (or materialized) and
+    /// executeImpl is re-run on them; for two full columns the operation is applied to the flattened nested
+    /// columns and the offsets of the first argument are reused for the result.
+    /// Throws LOGICAL_ERROR for a non-array result type or argument column, SIZES_OF_ARRAYS_DONT_MATCH for rows of unequal length.
+    ColumnPtr executeArrayImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const
+    {
+        const auto * return_type_array = checkAndGetDataType<DataTypeArray>(result_type.get());
+        if (!return_type_array)
+            throw Exception(ErrorCodes::LOGICAL_ERROR, "Return type for function {} must be array.", getName());
+
+        ColumnsWithTypeAndName new_arguments(arguments.size());
+        const auto * left_const = typeid_cast<const ColumnConst *>(arguments[0].column.get());
+        const auto * right_const = typeid_cast<const ColumnConst *>(arguments[1].column.get());
+
+        /// Unpacking arrays if both are constants.
+        if (left_const && right_const)
+        {
+            new_arguments[0] = {left_const->getDataColumnPtr(), arguments[0].type, arguments[0].name};
+            new_arguments[1] = {right_const->getDataColumnPtr(), arguments[1].type, arguments[1].name};
+            auto col = executeImpl(new_arguments, result_type, 1);
+            return ColumnConst::create(std::move(col), input_rows_count);
+        }
+
+        /// Unpacking arrays if at least one column is constant.
+        if (left_const || right_const)
+        {
+            new_arguments[0] = {arguments[0].column->convertToFullColumnIfConst(), arguments[0].type, arguments[0].name};
+            new_arguments[1] = {arguments[1].column->convertToFullColumnIfConst(), arguments[1].type, arguments[1].name};
+            return executeImpl(new_arguments, result_type, input_rows_count);
+        }
+
+        const auto * left_array_col = typeid_cast<const ColumnArray *>(arguments[0].column.get());
+        const auto * right_array_col = typeid_cast<const ColumnArray *>(arguments[1].column.get());
+        if (!left_array_col || !right_array_col)
+            throw Exception(ErrorCodes::LOGICAL_ERROR, "Arguments for function {} must be arrays.", getName());
+
+        const auto & left_offsets = left_array_col->getOffsets();
+        const auto & right_offsets = right_array_col->getOffsets();
+
+        chassert(left_offsets.size() == right_offsets.size() && "Unexpected difference in number of offsets");
+        /// Unpacking non-const arrays and checking sizes of them.
+        for (size_t row = 0; row < left_offsets.size(); ++row)
+        {
+            if (left_offsets[row] != right_offsets[row])
+            {
+                /// Offsets are cumulative and all previous rows matched, so both arrays start at the same position.
+                const auto row_begin = row ? left_offsets[row - 1] : 0;
+                throw Exception(ErrorCodes::SIZES_OF_ARRAYS_DONT_MATCH,
+                    "Cannot apply operation for arrays of different sizes. Size of the first argument: {}, size of the second argument: {}",
+                    left_offsets[row] - row_begin,
+                    right_offsets[row] - row_begin);
+            }
+        }
+
+        const auto & left_array_type = typeid_cast<const DataTypeArray *>(arguments[0].type.get())->getNestedType();
+        new_arguments[0] = {left_array_col->getDataPtr(), left_array_type, arguments[0].name};
+
+        const auto & right_array_type = typeid_cast<const DataTypeArray *>(arguments[1].type.get())->getNestedType();
+        new_arguments[1] = {right_array_col->getDataPtr(), right_array_type, arguments[1].name};
+
+        /// The flattened nested columns hold as many rows as the last (cumulative) offset.
+        const size_t rows_count = left_offsets.empty() ? 0 : left_offsets.back();
+        auto res = executeImpl(new_arguments, return_type_array->getNestedType(), rows_count);
+
+        return ColumnArray::create(res, left_array_col->getOffsetsPtr());
+    }
+
    ColumnPtr executeTupleNumberOperator(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type,
                                               size_t input_rows_count, const FunctionOverloadResolverPtr & function_builder) const
    {
@ -1326,6 +1406,20 @@ public:
            return getReturnTypeImplStatic(new_arguments, context);
        }

+
+        if constexpr (is_plus || is_minus)
+        {
+            if (isArray(arguments[0]) && isArray(arguments[1]))
+            {
+                DataTypes new_arguments {
+                        static_cast<const DataTypeArray &>(*arguments[0]).getNestedType(),
+                        static_cast<const DataTypeArray &>(*arguments[1]).getNestedType(),
+                };
+                return std::make_shared<DataTypeArray>(getReturnTypeImplStatic(new_arguments, context));
+            }
+        }
+
+
        /// Special case when the function is plus or minus, one of arguments is Date/DateTime and another is Interval.
        if (auto function_builder = getFunctionForIntervalArithmetic(arguments[0], arguments[1], context))
        {
@ -2031,6 +2125,9 @@ ColumnPtr executeStringInteger(const ColumnsWithTypeAndName & arguments, const A
                return (res = executeNumeric(arguments, left, right, right_nullmap)) != nullptr;
        });

+        if (isArray(result_type))
+            return executeArrayImpl(arguments, result_type, input_rows_count);
+
        if (!valid)
        {
            // This is a logical error, because the types should have been checked
--- a/src/Functions/FunctionsHashing.h
+++ b/src/Functions/FunctionsHashing.h
@ -534,18 +534,15 @@ struct JavaHashImpl
            static_cast<uint32_t>(x) ^ static_cast<uint32_t>(static_cast<uint64_t>(x) >> 32));
    }

-    template <class T, typename std::enable_if<std::is_same_v<T, int8_t>
-                                                   || std::is_same_v<T, int16_t>
-                                                   || std::is_same_v<T, int32_t>, T>::type * = nullptr>
+    template <class T, T * = nullptr>
+    requires std::same_as<T, int8_t> || std::same_as<T, int16_t> || std::same_as<T, int32_t>
    static ReturnType apply(T x)
    {
        return x;
    }

-    template <typename T, typename std::enable_if<!std::is_same_v<T, int8_t>
-                                                      && !std::is_same_v<T, int16_t>
-                                                      && !std::is_same_v<T, int32_t>
-                                                      && !std::is_same_v<T, int64_t>, T>::type * = nullptr>
+    template <class T, T * = nullptr>
+    requires(!std::same_as<T, int8_t> && !std::same_as<T, int16_t> && !std::same_as<T, int32_t>)
    static ReturnType apply(T x)
    {
        if (std::is_unsigned_v<T>)
--- a/src/Functions/TransformDateTime64.h
+++ b/src/Functions/TransformDateTime64.h
@ -88,8 +88,9 @@ public:
        }
    }

-    template <typename T, typename ... Args, typename = std::enable_if_t<!std::is_same_v<T, DateTime64>>>
-    inline auto execute(const T & t, Args && ... args) const
+    template <typename T, typename... Args>
+    requires (!std::same_as<T, DateTime64>)
+    inline auto execute(const T & t, Args &&... args) const
    {
        return wrapped_transform.execute(t, std::forward<Args>(args)...);
    }
@ -128,7 +129,8 @@ public:
        }
    }

-    template <typename T, typename ... Args, typename = std::enable_if_t<!std::is_same_v<T, DateTime64>>>
+    template <typename T, typename ... Args>
+    requires (!std::same_as<T, DateTime64>)
    inline auto executeExtendedResult(const T & t, Args && ... args) const
    {
        return wrapped_transform.executeExtendedResult(t, std::forward<Args>(args)...);
--- a/src/IO/Archives/ArchiveUtils.h
+++ b/src/IO/Archives/ArchiveUtils.h
@ -0,0 +1,14 @@
+#pragma once
+
+#include "config.h"
+
+#if USE_LIBARCHIVE
+
+/// Suppress -Wreserved-macro-identifier only while the libarchive headers are
+/// being included (presumably they trigger it), then restore the previous
+/// diagnostic state. The includes themselves must not depend on the compiler.
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wreserved-macro-identifier"
+#endif
+
+#include <archive.h>
+#include <archive_entry.h>
+
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
+#endif
--- a/src/IO/Archives/IArchiveReader.h
+++ b/src/IO/Archives/IArchiveReader.h
@ -40,18 +40,26 @@ public:
        virtual bool nextFile() = 0;
    };

+    virtual const std::string & getPath() const = 0;
+
    /// Starts enumerating files in the archive.
    virtual std::unique_ptr<FileEnumerator> firstFile() = 0;

+    using NameFilter = std::function<bool(const std::string &)>;
+
    /// Starts reading a file from the archive. The function returns a read buffer,
    /// you can read that buffer to extract uncompressed data from the archive.
    /// Several read buffers can be used at the same time in parallel.
    virtual std::unique_ptr<ReadBufferFromFileBase> readFile(const String & filename) = 0;
+    virtual std::unique_ptr<ReadBufferFromFileBase> readFile(NameFilter filter) = 0;

    /// It's possible to convert a file enumerator to a read buffer and vice versa.
    virtual std::unique_ptr<ReadBufferFromFileBase> readFile(std::unique_ptr<FileEnumerator> enumerator) = 0;
    virtual std::unique_ptr<FileEnumerator> nextFile(std::unique_ptr<ReadBuffer> read_buffer) = 0;

+    virtual std::vector<std::string> getAllFiles() = 0;
+    virtual std::vector<std::string> getAllFiles(NameFilter filter) = 0;
+
    /// Sets password used to decrypt files in the archive.
    virtual void setPassword(const String & /* password */) {}

--- a/src/IO/Archives/LibArchiveReader.cpp
+++ b/src/IO/Archives/LibArchiveReader.cpp
@ -0,0 +1,348 @@
+#include <IO/Archives/LibArchiveReader.h>
+#include <IO/ReadBufferFromFileBase.h>
+#include <Common/quoteString.h>
+#include <Common/scope_guard_safe.h>
+
+#include <IO/Archives/ArchiveUtils.h>
+
+#include <mutex>
+
+namespace DB
+{
+
+#if USE_LIBARCHIVE
+
+namespace ErrorCodes
+{
+    extern const int CANNOT_UNPACK_ARCHIVE;
+    extern const int LOGICAL_ERROR;
+    extern const int CANNOT_READ_ALL_DATA;
+    extern const int UNSUPPORTED_METHOD;
+}
+
+/// Wraps one libarchive read handle for `path_to_archive` together with the entry
+/// the handle is currently positioned at. Copying is disabled; moving transfers
+/// ownership of the libarchive handle to the new object.
+class LibArchiveReader::Handle
+{
+public:
+    /// Opens the archive. Throws CANNOT_UNPACK_ARCHIVE if libarchive cannot open it.
+    explicit Handle(std::string path_to_archive_, bool lock_on_reading_)
+        : path_to_archive(std::move(path_to_archive_)), lock_on_reading(lock_on_reading_)
+    {
+        current_archive = open(path_to_archive);
+    }
+
+    Handle(const Handle &) = delete;
+
+    /// Takes over the libarchive handle and the current entry of `other`.
+    /// The archive path is carried over as well, because getAllFiles() reopens the archive by path.
+    Handle(Handle && other) noexcept
+        : current_archive(other.current_archive)
+        , current_entry(other.current_entry)
+        , path_to_archive(std::move(other.path_to_archive))
+        , lock_on_reading(other.lock_on_reading)
+    {
+        other.current_archive = nullptr;
+        other.current_entry = nullptr;
+    }
+
+    ~Handle()
+    {
+        close(current_archive);
+    }
+
+    /// Advances to the first following entry whose path equals `filename`.
+    bool locateFile(const std::string & filename)
+    {
+        return locateFile([&](const std::string & file) { return file == filename; });
+    }
+
+    /// Advances entry by entry until `filter` accepts an entry path.
+    /// Returns false when the headers are exhausted; throws on ARCHIVE_FATAL.
+    bool locateFile(NameFilter filter)
+    {
+        resetFileInfo();
+        int err = ARCHIVE_OK;
+        while (true)
+        {
+            err = readNextHeader(current_archive, &current_entry);
+
+            if (err == ARCHIVE_RETRY)
+                continue;
+
+            if (err != ARCHIVE_OK)
+                break;
+
+            if (filter(archive_entry_pathname(current_entry)))
+                return true;
+        }
+
+        checkError(current_archive, err);
+        return false;
+    }
+
+    /// Moves to the next entry. Returns true if a header was read successfully; throws on ARCHIVE_FATAL.
+    bool nextFile()
+    {
+        resetFileInfo();
+        int err = ARCHIVE_OK;
+        do
+        {
+            err = readNextHeader(current_archive, &current_entry);
+        } while (err == ARCHIVE_RETRY);
+
+        checkError(current_archive, err);
+        return err == ARCHIVE_OK;
+    }
+
+    /// Lists the paths of all entries accepted by `filter` (all entries if `filter` is empty).
+    /// Uses a separately opened archive, so the position of this handle is not changed.
+    std::vector<std::string> getAllFiles(NameFilter filter)
+    {
+        auto * archive = open(path_to_archive);
+        SCOPE_EXIT(
+            close(archive);
+        );
+
+        struct archive_entry * entry = nullptr;
+
+        std::vector<std::string> files;
+        int error = ARCHIVE_OK;
+        while (true)
+        {
+            error = readNextHeader(archive, &entry);
+
+            /// On retry no header was delivered, so `entry` must not be inspected.
+            if (error == ARCHIVE_RETRY)
+                continue;
+
+            if (error != ARCHIVE_OK)
+                break;
+
+            chassert(entry != nullptr);
+            std::string name = archive_entry_pathname(entry);
+            if (!filter || filter(name))
+                files.push_back(std::move(name));
+        }
+
+        /// Report the error of the archive that was actually iterated, not of `current_archive`.
+        checkError(archive, error);
+        return files;
+    }
+
+    /// Path of the current entry; computed on first use and cached until the handle moves on.
+    const String & getFileName() const
+    {
+        chassert(current_entry);
+        if (!file_name)
+            file_name.emplace(archive_entry_pathname(current_entry));
+
+        return *file_name;
+    }
+
+    /// Info about the current entry; computed on first use and cached until the handle moves on.
+    /// Both sizes are filled from archive_entry_size().
+    const FileInfo & getFileInfo() const
+    {
+        chassert(current_entry);
+        if (!file_info)
+        {
+            file_info.emplace();
+            file_info->uncompressed_size = archive_entry_size(current_entry);
+            file_info->compressed_size = archive_entry_size(current_entry);
+            file_info->is_encrypted = false;
+        }
+
+        return *file_info;
+    }
+
+    struct archive * current_archive = nullptr;
+    struct archive_entry * current_entry = nullptr;
+private:
+    /// Throws CANNOT_UNPACK_ARCHIVE if `error` is ARCHIVE_FATAL, using the error text of `archive`.
+    static void checkError(struct archive * archive, int error)
+    {
+        if (error == ARCHIVE_FATAL)
+            throw Exception(ErrorCodes::CANNOT_UNPACK_ARCHIVE, "Failed to read archive while fetching all files: {}", archive_error_string(archive));
+    }
+
+    /// Drops the cached name and info of the current entry.
+    void resetFileInfo()
+    {
+        file_name.reset();
+        file_info.reset();
+    }
+
+    /// Creates a libarchive reader with all filters and formats enabled and opens the file.
+    /// The reader is released again if opening fails.
+    static struct archive * open(const String & path_to_archive)
+    {
+        auto * archive = archive_read_new();
+        try
+        {
+            archive_read_support_filter_all(archive);
+            archive_read_support_format_all(archive);
+            if (archive_read_open_filename(archive, path_to_archive.c_str(), 10240) != ARCHIVE_OK)
+                throw Exception(ErrorCodes::CANNOT_UNPACK_ARCHIVE, "Couldn't open archive: {}", quoteString(path_to_archive));
+        }
+        catch (...)
+        {
+            close(archive);
+            throw;
+        }
+
+        return archive;
+    }
+
+    /// Closes and frees a libarchive reader; a null pointer is ignored.
+    static void close(struct archive * archive)
+    {
+        if (archive)
+        {
+            archive_read_close(archive);
+            archive_read_free(archive);
+        }
+    }
+
+    /// Reads the next header, holding `read_lock` for the duration if `lock_on_reading` is set.
+    int readNextHeader(struct archive * archive, struct archive_entry ** entry) const
+    {
+        std::unique_lock lock(Handle::read_lock, std::defer_lock);
+        if (lock_on_reading)
+            lock.lock();
+
+        return archive_read_next_header(archive, entry);
+    }
+
+    /// Not const so that the move constructor can transfer it without copying.
+    String path_to_archive;
+
+    /// for some archive types when we are reading headers static variables are used
+    /// which are not thread-safe
+    const bool lock_on_reading;
+    static inline std::mutex read_lock;
+
+    mutable std::optional<String> file_name;
+    mutable std::optional<FileInfo> file_info;
+};
+
+/// File enumerator that owns a Handle and forwards every query to it.
+class LibArchiveReader::FileEnumeratorImpl : public FileEnumerator
+{
+public:
+    explicit FileEnumeratorImpl(Handle handle_)
+        : handle(std::move(handle_))
+    {
+    }
+
+    const String & getFileName() const override
+    {
+        return handle.getFileName();
+    }
+
+    const FileInfo & getFileInfo() const override
+    {
+        return handle.getFileInfo();
+    }
+
+    bool nextFile() override
+    {
+        return handle.nextFile();
+    }
+
+    /// Gives up the owned handle so that it can be handed over to a read buffer.
+    Handle releaseHandle() &&
+    {
+        return std::move(handle);
+    }
+
+private:
+    Handle handle;
+};
+
+/// Read buffer that streams the data of the entry a Handle is currently positioned at.
+/// Seeking and position queries are not supported.
+class LibArchiveReader::ReadBufferFromLibArchive : public ReadBufferFromFileBase
+{
+public:
+    explicit ReadBufferFromLibArchive(Handle handle_, std::string path_to_archive_)
+        : ReadBufferFromFileBase(DBMS_DEFAULT_BUFFER_SIZE, nullptr, 0)
+        , handle(std::move(handle_))
+        , path_to_archive(std::move(path_to_archive_))
+    {}
+
+    off_t seek(off_t /* off */, int /* whence */) override
+    {
+        throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Seek is not supported when reading from archive");
+    }
+
+    off_t getPosition() override
+    {
+        throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "getPosition not supported when reading from archive");
+    }
+
+    String getFileName() const override { return handle.getFileName(); }
+
+    /// Gives up the owned handle so that enumeration can continue from the current entry.
+    Handle releaseHandle() &&
+    {
+        return std::move(handle);
+    }
+
+private:
+    /// Fills the internal buffer with the next chunk of the current entry.
+    /// Returns false at the end of the entry; throws CANNOT_READ_ALL_DATA if libarchive reports an error.
+    bool nextImpl() override
+    {
+        auto bytes_read = archive_read_data(handle.current_archive, internal_buffer.begin(), internal_buffer.size());
+
+        if (bytes_read < 0)
+            throw Exception(ErrorCodes::CANNOT_READ_ALL_DATA, "Failed to read file {} from {}: {}", handle.getFileName(), path_to_archive, archive_error_string(handle.current_archive));
+
+        if (!bytes_read)
+            return false;
+
+        /// Count the bytes produced by this call only.
+        total_bytes_read += static_cast<size_t>(bytes_read);
+
+        working_buffer = internal_buffer;
+        working_buffer.resize(bytes_read);
+        return true;
+    }
+
+    Handle handle;
+    const String path_to_archive;
+    size_t total_bytes_read = 0;
+};
+
+/// Stores the format name, the locking mode and the archive path; the archive itself is not opened here.
+LibArchiveReader::LibArchiveReader(std::string archive_name_, bool lock_on_reading_, std::string path_to_archive_)
+    : archive_name(std::move(archive_name_))
+    , lock_on_reading(lock_on_reading_)
+    , path_to_archive(std::move(path_to_archive_))
+{
+}
+
+LibArchiveReader::~LibArchiveReader() = default;
+
+/// Returns the archive path this reader was constructed with.
+const std::string & LibArchiveReader::getPath() const
+{
+    return path_to_archive;
+}
+
+/// Opens the archive with a temporary handle and scans its entries for `filename`.
+bool LibArchiveReader::fileExists(const String & filename)
+{
+    return Handle(path_to_archive, lock_on_reading).locateFile(filename);
+}
+
+/// Returns the info of the entry named `filename`; throws CANNOT_UNPACK_ARCHIVE if there is no such entry.
+LibArchiveReader::FileInfo LibArchiveReader::getFileInfo(const String & filename)
+{
+    Handle archive_handle(path_to_archive, lock_on_reading);
+    if (archive_handle.locateFile(filename))
+        return archive_handle.getFileInfo();
+
+    throw Exception(ErrorCodes::CANNOT_UNPACK_ARCHIVE, "Couldn't unpack archive {}: file not found", path_to_archive);
+}
+
+/// Opens the archive and positions an enumerator on its first entry.
+/// Returns nullptr if no entry header could be read.
+std::unique_ptr<LibArchiveReader::FileEnumerator> LibArchiveReader::firstFile()
+{
+    Handle archive_handle(path_to_archive, lock_on_reading);
+    if (archive_handle.nextFile())
+        return std::make_unique<FileEnumeratorImpl>(std::move(archive_handle));
+
+    return nullptr;
+}
+
+/// Reads the entry whose path is exactly `filename` by delegating to the filter-based overload.
+std::unique_ptr<ReadBufferFromFileBase> LibArchiveReader::readFile(const String & filename)
+{
+    auto has_requested_name = [&filename](const std::string & candidate) { return candidate == filename; };
+    return readFile(has_requested_name);
+}
+
+/// Returns a read buffer for the first entry accepted by `filter`;
+/// throws CANNOT_UNPACK_ARCHIVE if no entry is accepted.
+std::unique_ptr<ReadBufferFromFileBase> LibArchiveReader::readFile(NameFilter filter)
+{
+    Handle archive_handle(path_to_archive, lock_on_reading);
+    if (archive_handle.locateFile(filter))
+        return std::make_unique<ReadBufferFromLibArchive>(std::move(archive_handle), path_to_archive);
+
+    throw Exception(
+        ErrorCodes::CANNOT_UNPACK_ARCHIVE, "Couldn't unpack archive {}: no file found satisfying the filter", path_to_archive);
+}
+
+/// Converts an enumerator created by this reader type into a read buffer for the entry it points at.
+/// Throws LOGICAL_ERROR if the enumerator is of a different implementation.
+std::unique_ptr<ReadBufferFromFileBase> LibArchiveReader::readFile(std::unique_ptr<FileEnumerator> enumerator)
+{
+    if (dynamic_cast<FileEnumeratorImpl *>(enumerator.get()) == nullptr)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Wrong enumerator passed to readFile()");
+
+    /// The type was verified above, so ownership can be re-wrapped with the concrete type.
+    std::unique_ptr<FileEnumeratorImpl> impl(static_cast<FileEnumeratorImpl *>(enumerator.release()));
+    Handle archive_handle = std::move(*impl).releaseHandle();
+    return std::make_unique<ReadBufferFromLibArchive>(std::move(archive_handle), path_to_archive);
+}
+
+/// Converts a read buffer created by this reader type back into an enumerator positioned on the next entry.
+/// Returns nullptr if there is no further entry; throws LOGICAL_ERROR for a foreign buffer type.
+std::unique_ptr<LibArchiveReader::FileEnumerator> LibArchiveReader::nextFile(std::unique_ptr<ReadBuffer> read_buffer)
+{
+    if (dynamic_cast<ReadBufferFromLibArchive *>(read_buffer.get()) == nullptr)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Wrong ReadBuffer passed to nextFile()");
+
+    /// The type was verified above, so ownership can be re-wrapped with the concrete type.
+    std::unique_ptr<ReadBufferFromLibArchive> buffer_impl(static_cast<ReadBufferFromLibArchive *>(read_buffer.release()));
+    Handle archive_handle = std::move(*buffer_impl).releaseHandle();
+    if (archive_handle.nextFile())
+        return std::make_unique<FileEnumeratorImpl>(std::move(archive_handle));
+
+    return nullptr;
+}
+
+/// Lists every entry of the archive by passing an empty filter to the filtering overload.
+std::vector<std::string> LibArchiveReader::getAllFiles()
+{
+    return getAllFiles(NameFilter{});
+}
+
+/// Lists the entries accepted by `filter` using a temporary handle.
+std::vector<std::string> LibArchiveReader::getAllFiles(NameFilter filter)
+{
+    return Handle(path_to_archive, lock_on_reading).getAllFiles(filter);
+}
+
+/// Always throws LOGICAL_ERROR: this reader does not accept a password.
+/// The message names the archive format stored in `archive_name`.
+void LibArchiveReader::setPassword(const String & /*password_*/)
+{
+    throw Exception(ErrorCodes::LOGICAL_ERROR, "Can not set password to {} archive", archive_name);
+}
+
+#endif
+
+}
--- a/src/IO/Archives/LibArchiveReader.h
+++ b/src/IO/Archives/LibArchiveReader.h
@ -0,0 +1,79 @@
+#pragma once
+
+#include "config.h"
+
+#include <IO/Archives/IArchiveReader.h>
+
+#include <iostream>
+
+namespace DB
+{
+
+#if USE_LIBARCHIVE
+
+class ReadBuffer;
+class ReadBufferFromFileBase;
+class SeekableReadBuffer;
+
+/// Implementation of IArchiveReader for reading archives using libarchive.
+/// The constructor is protected; the subclasses in this header select the format name and the locking mode.
+class LibArchiveReader : public IArchiveReader
+{
+public:
+    ~LibArchiveReader() override;
+
+    /// Returns the path of the archive this reader was created for.
+    const std::string & getPath() const override;
+
+    /// Returns true if there is a specified file in the archive.
+    bool fileExists(const String & filename) override;
+
+    /// Returns the information about a file stored in the archive.
+    FileInfo getFileInfo(const String & filename) override;
+
+    /// Starts enumerating files in the archive.
+    std::unique_ptr<FileEnumerator> firstFile() override;
+
+    /// Starts reading a file from the archive. The function returns a read buffer,
+    /// you can read that buffer to extract uncompressed data from the archive.
+    /// Several read buffers can be used at the same time in parallel.
+    std::unique_ptr<ReadBufferFromFileBase> readFile(const String & filename) override;
+    /// Same, but reads the first file whose name is accepted by `filter`.
+    std::unique_ptr<ReadBufferFromFileBase> readFile(NameFilter filter) override;
+
+    /// It's possible to convert a file enumerator to a read buffer and vice versa.
+    std::unique_ptr<ReadBufferFromFileBase> readFile(std::unique_ptr<FileEnumerator> enumerator) override;
+    std::unique_ptr<FileEnumerator> nextFile(std::unique_ptr<ReadBuffer> read_buffer) override;
+
+    /// Returns the names of all files in the archive, optionally restricted to those accepted by `filter`.
+    std::vector<std::string> getAllFiles() override;
+    std::vector<std::string> getAllFiles(NameFilter filter) override;
+
+    /// Passwords are not supported by this reader: the implementation always throws.
+    void setPassword(const String & password_) override;
+
+protected:
+    /// Constructs an archive's reader that will read from a file in the local filesystem.
+    LibArchiveReader(std::string archive_name_, bool lock_on_reading_, std::string path_to_archive_);
+
+private:
+    class ReadBufferFromLibArchive;
+    class Handle;
+    class FileEnumeratorImpl;
+
+    /// Archive format name; used in the error message of setPassword().
+    const std::string archive_name;
+    /// Passed to every Handle; when true, reading entry headers is guarded by a mutex.
+    const bool lock_on_reading;
+    /// Path of the archive file; returned by getPath().
+    const String path_to_archive;
+};
+
+/// LibArchiveReader configured with the format name "tar" and with locking on header reads enabled.
+class TarArchiveReader : public LibArchiveReader
+{
+public:
+    explicit TarArchiveReader(std::string path_to_archive) : LibArchiveReader("tar", /*lock_on_reading_=*/ true, std::move(path_to_archive)) { }
+};
+
+/// LibArchiveReader configured with the format name "7z" and with locking on header reads disabled.
+class SevenZipArchiveReader : public LibArchiveReader
+{
+public:
+    explicit SevenZipArchiveReader(std::string path_to_archive) : LibArchiveReader("7z", /*lock_on_reading_=*/ false, std::move(path_to_archive)) { }
+};
+
+#endif
+
+}
--- a/src/IO/Archives/ZipArchiveReader.cpp
+++ b/src/IO/Archives/ZipArchiveReader.cpp
@ -85,6 +85,26 @@ public:
        file_name = file_name_;
    }

+    /// Positions the handle on the first archive entry whose name is accepted by `filter`,
+    /// scanning from the first entry. If the archive is empty or no entry matches,
+    /// showError() is called (presumably it throws - TODO confirm).
+    void locateFile(NameFilter filter)
+    {
+        int err = unzGoToFirstFile(raw_handle);
+        if (err == UNZ_END_OF_LIST_OF_FILE)
+            showError("No file was found satisfying the filter");
+
+        do
+        {
+            checkResult(err);
+            /// Refresh the cached entry info before asking the filter about the entry name.
+            resetFileInfo();
+            retrieveFileInfo();
+            if (filter(getFileName()))
+                return;
+
+            err = unzGoToNextFile(raw_handle);
+        } while (err != UNZ_END_OF_LIST_OF_FILE);
+
+        showError("No file was found satisfying the filter");
+    }
+
    bool tryLocateFile(const String & file_name_)
    {
        resetFileInfo();
@ -131,6 +151,27 @@ public:
        return *file_info;
    }

+    /// Collects the names of all entries accepted by `filter` (every entry when `filter` is empty),
+    /// scanning from the first entry. Returns an empty list for an archive without entries.
+    /// The handle's current position is changed by the scan.
+    std::vector<std::string> getAllFiles(NameFilter filter)
+    {
+        std::vector<std::string> files;
+        resetFileInfo();
+        int err = unzGoToFirstFile(raw_handle);
+        if (err == UNZ_END_OF_LIST_OF_FILE)
+            return files;
+
+        do
+        {
+            checkResult(err);
+            /// Refresh the cached entry info so that the name below belongs to the current entry.
+            resetFileInfo();
+            retrieveFileInfo();
+            if (!filter || filter(getFileName()))
+                files.push_back(*file_name);
+            err = unzGoToNextFile(raw_handle);
+        } while (err != UNZ_END_OF_LIST_OF_FILE);
+
+        return files;
+    }
+
    void closeFile()
    {
        int err = unzCloseCurrentFile(raw_handle);
@ -459,6 +500,11 @@ ZipArchiveReader::~ZipArchiveReader()
    }
 }

+/// Returns the stored path of the archive.
+const std::string & ZipArchiveReader::getPath() const
+{
+    return path_to_archive;
+}
+
 bool ZipArchiveReader::fileExists(const String & filename)
 {
    return acquireHandle().tryLocateFile(filename);
@ -486,6 +532,13 @@ std::unique_ptr<ReadBufferFromFileBase> ZipArchiveReader::readFile(const String
    return std::make_unique<ReadBufferFromZipArchive>(std::move(handle));
 }

+/// Returns a read buffer for the first entry whose name is accepted by `filter`.
+/// The failure case is handled inside locateFile().
+std::unique_ptr<ReadBufferFromFileBase> ZipArchiveReader::readFile(NameFilter filter)
+{
+    auto handle = acquireHandle();
+    handle.locateFile(filter);
+    return std::make_unique<ReadBufferFromZipArchive>(std::move(handle));
+}
+
 std::unique_ptr<ReadBufferFromFileBase> ZipArchiveReader::readFile(std::unique_ptr<FileEnumerator> enumerator)
 {
    if (!dynamic_cast<FileEnumeratorImpl *>(enumerator.get()))
@ -506,6 +559,17 @@ std::unique_ptr<ZipArchiveReader::FileEnumerator> ZipArchiveReader::nextFile(std
    return std::make_unique<FileEnumeratorImpl>(std::move(handle));
 }

+/// Lists every entry of the archive by passing an empty filter to the filtering overload.
+std::vector<std::string> ZipArchiveReader::getAllFiles()
+{
+    return getAllFiles(NameFilter{});
+}
+
+/// Lists the entries accepted by `filter` using a handle obtained from acquireHandle().
+std::vector<std::string> ZipArchiveReader::getAllFiles(NameFilter filter)
+{
+    auto handle = acquireHandle();
+    return handle.getAllFiles(filter);
+}
+
 void ZipArchiveReader::setPassword(const String & password_)
 {
    std::lock_guard lock{mutex};
--- a/src/IO/Archives/ZipArchiveReader.h
+++ b/src/IO/Archives/ZipArchiveReader.h
@ -27,6 +27,8 @@ public:

    ~ZipArchiveReader() override;

+    const std::string & getPath() const override;
+
    /// Returns true if there is a specified file in the archive.
    bool fileExists(const String & filename) override;

@ -40,11 +42,15 @@ public:
    /// you can read that buffer to extract uncompressed data from the archive.
    /// Several read buffers can be used at the same time in parallel.
    std::unique_ptr<ReadBufferFromFileBase> readFile(const String & filename) override;
+    std::unique_ptr<ReadBufferFromFileBase> readFile(NameFilter filter) override;

    /// It's possible to convert a file enumerator to a read buffer and vice versa.
    std::unique_ptr<ReadBufferFromFileBase> readFile(std::unique_ptr<FileEnumerator> enumerator) override;
    std::unique_ptr<FileEnumerator> nextFile(std::unique_ptr<ReadBuffer> read_buffer) override;

+    std::vector<std::string> getAllFiles() override;
+    std::vector<std::string> getAllFiles(NameFilter filter) override;
+
    /// Sets password used to decrypt the contents of the files in the archive.
    void setPassword(const String & password_) override;

--- a/src/IO/Archives/createArchiveReader.cpp
+++ b/src/IO/Archives/createArchiveReader.cpp
@ -1,5 +1,6 @@
 #include <IO/Archives/createArchiveReader.h>
 #include <IO/Archives/ZipArchiveReader.h>
+#include <IO/Archives/LibArchiveReader.h>
 #include <Common/Exception.h>


@ -29,10 +30,28 @@ std::shared_ptr<IArchiveReader> createArchiveReader(
        return std::make_shared<ZipArchiveReader>(path_to_archive, archive_read_function, archive_size);
 #else
        throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "minizip library is disabled");
+#endif
+    }
+    else if (path_to_archive.ends_with(".tar") || path_to_archive.ends_with("tar.gz"))
+    {
+#if USE_LIBARCHIVE
+        return std::make_shared<TarArchiveReader>(path_to_archive);
+#else
+        throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "libarchive library is disabled");
+#endif
+    }
+    else if (path_to_archive.ends_with(".7z"))
+    {
+#if USE_LIBARCHIVE
+        return std::make_shared<SevenZipArchiveReader>(path_to_archive);
+#else
+        throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "libarchive library is disabled");
 #endif
    }
    else
+    {
        throw Exception(ErrorCodes::CANNOT_UNPACK_ARCHIVE, "Cannot determine the type of archive {}", path_to_archive);
+    }
 }

 }
--- a/src/IO/MySQLPacketPayloadReadBuffer.cpp
+++ b/src/IO/MySQLPacketPayloadReadBuffer.cpp
@ -45,6 +45,9 @@ bool MySQLPacketPayloadReadBuffer::nextImpl()
    }

    in.nextIfAtEnd();
+    /// Don't return a buffer when no bytes are available.
+    if (!in.hasPendingData())
+        return false;
    working_buffer = ReadBuffer::Buffer(in.position(), in.buffer().end());
    size_t count = std::min(in.available(), payload_length - offset);
    working_buffer.resize(count);
--- a/src/IO/ParallelReadBuffer.cpp
+++ b/src/IO/ParallelReadBuffer.cpp
@ -8,7 +8,7 @@ namespace DB

 namespace ErrorCodes
 {
-    extern const int LOGICAL_ERROR;
+    extern const int UNEXPECTED_END_OF_FILE;
    extern const int CANNOT_SEEK_THROUGH_FILE;
    extern const int SEEK_POSITION_OUT_OF_BOUND;

@ -260,7 +260,7 @@ void ParallelReadBuffer::readerThreadFunction(ReadWorkerPtr read_worker)

        if (!on_progress(r) && r < read_worker->segment.size())
            throw Exception(
-                ErrorCodes::LOGICAL_ERROR,
+                ErrorCodes::UNEXPECTED_END_OF_FILE,
                "Failed to read all the data from the reader at offset {}, got {}/{} bytes",
                read_worker->start_offset, r, read_worker->segment.size());
    }
--- a/src/IO/S3/PocoHTTPClient.cpp
+++ b/src/IO/S3/PocoHTTPClient.cpp
@ -536,7 +536,10 @@ void PocoHTTPClient::makeRequestInternalImpl(
    }
    catch (...)
    {
-        tryLogCurrentException(log, fmt::format("Failed to make request to: {}", uri));
+        auto error_message = getCurrentExceptionMessageAndPattern(/* with_stacktrace */ true);
+        error_message.text = fmt::format("Failed to make request to: {}: {}", uri, error_message.text);
+        LOG_INFO(log, error_message);
+
        response->SetClientErrorType(Aws::Client::CoreErrors::NETWORK_CONNECTION);
        response->SetClientErrorMessage(getCurrentExceptionMessage(false));

--- a/src/IO/ZstdDeflatingAppendableWriteBuffer.cpp
+++ b/src/IO/ZstdDeflatingAppendableWriteBuffer.cpp
@ -42,13 +42,50 @@ void ZstdDeflatingAppendableWriteBuffer::nextImpl()
    if (!offset())
        return;

+    input.src = reinterpret_cast<unsigned char *>(working_buffer.begin());
+    input.size = offset();
+    input.pos = 0;
+
    if (first_write && append_to_existing_file && isNeedToAddEmptyBlock())
    {
        addEmptyBlock();
        first_write = false;
    }

-    flush(ZSTD_e_flush);
+    try
+    {
+        bool ended = false;
+        do
+        {
+            out->nextIfAtEnd();
+
+            output.dst = reinterpret_cast<unsigned char *>(out->buffer().begin());
+            output.size = out->buffer().size();
+            output.pos = out->offset();
+
+            size_t compression_result = ZSTD_compressStream2(cctx, &output, &input, ZSTD_e_flush);
+            if (ZSTD_isError(compression_result))
+                throw Exception(
+                                ErrorCodes::ZSTD_ENCODER_FAILED,
+                                "ZSTD stream decoding failed: error code: {}; ZSTD version: {}",
+                                ZSTD_getErrorName(compression_result), ZSTD_VERSION_STRING);
+
+            first_write = false;
+            out->position() = out->buffer().begin() + output.pos;
+
+            bool everything_was_compressed = (input.pos == input.size);
+            bool everything_was_flushed = compression_result == 0;
+
+            ended = everything_was_compressed && everything_was_flushed;
+        } while (!ended);
+    }
+    catch (...)
+    {
+        /// Do not try to write next time after exception.
+        out->position() = out->buffer().begin();
+        throw;
+    }
+
 }

 ZstdDeflatingAppendableWriteBuffer::~ZstdDeflatingAppendableWriteBuffer()
@ -66,58 +103,58 @@ void ZstdDeflatingAppendableWriteBuffer::finalizeImpl()
    }
    else
    {
-        finalizeBefore();
-        out->finalize();
-        finalizeAfter();
+        try
+        {
+            finalizeBefore();
+            out->finalize();
+            finalizeAfter();
+        }
+        catch (...)
+        {
+            /// Do not try to flush next time after exception.
+            out->position() = out->buffer().begin();
+            throw;
+        }
    }
 }

 void ZstdDeflatingAppendableWriteBuffer::finalizeBefore()
 {
-    /// Actually we can use ZSTD_e_flush here and add empty termination
-    /// block on each new buffer creation for non-empty file unconditionally (without isNeedToAddEmptyBlock).
-    /// However ZSTD_decompressStream is able to read non-terminated frame (we use it in reader buffer),
-    /// but console zstd utility cannot.
-    flush(ZSTD_e_end);
-}
+    next();
+
+    out->nextIfAtEnd();

-void ZstdDeflatingAppendableWriteBuffer::flush(ZSTD_EndDirective mode)
-{
    input.src = reinterpret_cast<unsigned char *>(working_buffer.begin());
    input.size = offset();
    input.pos = 0;

-    try
-    {
-        bool ended = false;
-        do
-        {
-            out->nextIfAtEnd();
+    output.dst = reinterpret_cast<unsigned char *>(out->buffer().begin());
+    output.size = out->buffer().size();
+    output.pos = out->offset();

+    /// Actually we can use ZSTD_e_flush here and add empty termination
+    /// block on each new buffer creation for non-empty file unconditionally (without isNeedToAddEmptyBlock).
+    /// However ZSTD_decompressStream is able to read non-terminated frame (we use it in reader buffer),
+    /// but console zstd utility cannot.
+    size_t remaining = ZSTD_compressStream2(cctx, &output, &input, ZSTD_e_end);
+    while (remaining != 0)
+    {
+        if (ZSTD_isError(remaining))
+            throw Exception(ErrorCodes::ZSTD_ENCODER_FAILED,
+                            "ZSTD stream encoder end failed: error: '{}' ZSTD version: {}",
+                            ZSTD_getErrorName(remaining), ZSTD_VERSION_STRING);
+
+        remaining = ZSTD_compressStream2(cctx, &output, &input, ZSTD_e_end);
+
+        out->position() = out->buffer().begin() + output.pos;
+
+        if (!out->hasPendingData())
+        {
+            out->next();
            output.dst = reinterpret_cast<unsigned char *>(out->buffer().begin());
            output.size = out->buffer().size();
            output.pos = out->offset();
-
-            size_t compression_result = ZSTD_compressStream2(cctx, &output, &input, mode);
-            if (ZSTD_isError(compression_result))
-                throw Exception(
-                                ErrorCodes::ZSTD_ENCODER_FAILED,
-                                "ZSTD stream decoding failed: error code: {}; ZSTD version: {}",
-                                ZSTD_getErrorName(compression_result), ZSTD_VERSION_STRING);
-
-            out->position() = out->buffer().begin() + output.pos;
-
-            bool everything_was_compressed = (input.pos == input.size);
-            bool everything_was_flushed = compression_result == 0;
-
-            ended = everything_was_compressed && everything_was_flushed;
-        } while (!ended);
-    }
-    catch (...)
-    {
-        /// Do not try to write next time after exception.
-        out->position() = out->buffer().begin();
-        throw;
+        }
    }
 }

--- a/src/IO/ZstdDeflatingAppendableWriteBuffer.h
+++ b/src/IO/ZstdDeflatingAppendableWriteBuffer.h
@ -52,8 +52,6 @@ private:
    /// NOTE: will fill compressed data to the out.working_buffer, but will not call out.next method until the buffer is full
    void nextImpl() override;

-    void flush(ZSTD_EndDirective mode);
-
    /// Write terminating ZSTD_e_end: empty block + frame epilogue. BTW it
    /// should be almost noop, because frame epilogue contains only checksums,
    /// and they are disabled for this buffer.
--- a/src/IO/tests/gtest_archive_reader_and_writer.cpp
+++ b/src/IO/tests/gtest_archive_reader_and_writer.cpp
@ -1,6 +1,7 @@
 #include <gtest/gtest.h>
 #include "config.h"

+#include <IO/Archives/ArchiveUtils.h>
 #include <IO/Archives/IArchiveReader.h>
 #include <IO/Archives/IArchiveWriter.h>
 #include <IO/Archives/createArchiveReader.h>
@ -19,11 +20,52 @@
 namespace DB::ErrorCodes
 {
    extern const int CANNOT_UNPACK_ARCHIVE;
+    extern const int LOGICAL_ERROR;
 }

 namespace fs = std::filesystem;
 using namespace DB;

+/// Archive formats that createArchiveWithFiles() can produce for the tests.
+enum class ArchiveType : uint8_t
+{
+    Tar,
+    SevenZip
+};
+
+/// Writes an archive of the requested type to `archivename`, storing every
+/// (file name -> contents) pair of `files` as a regular file.
+/// Returns true only if all libarchive write calls succeeded, false otherwise.
+template <ArchiveType archive_type>
+bool createArchiveWithFiles(const std::string & archivename, const std::map<std::string, std::string> & files)
+{
+    struct archive * a = archive_write_new();
+    if (!a)
+        return false;
+
+    int format_result = ARCHIVE_OK;
+    if constexpr (archive_type == ArchiveType::Tar)
+        format_result = archive_write_set_format_pax_restricted(a);
+    else if constexpr (archive_type == ArchiveType::SevenZip)
+        format_result = archive_write_set_format_7zip(a);
+    else
+    {
+        archive_write_free(a);
+        throw DB::Exception(ErrorCodes::LOGICAL_ERROR, "Invalid archive type requested: {}", static_cast<size_t>(archive_type));
+    }
+
+    bool ok = format_result == ARCHIVE_OK && archive_write_open_filename(a, archivename.c_str()) == ARCHIVE_OK;
+
+    if (ok)
+    {
+        for (const auto & [filename, content] : files)
+        {
+            struct archive_entry * entry = archive_entry_new();
+            if (!entry)
+            {
+                ok = false;
+                break;
+            }
+
+            archive_entry_set_pathname(entry, filename.c_str());
+            archive_entry_set_size(entry, content.size());
+            archive_entry_set_mode(entry, S_IFREG | 0644); // regular file with rw-r--r-- permissions
+            archive_entry_set_mtime(entry, time(nullptr), 0);
+
+            /// ARCHIVE_WARN still means that the header was written.
+            const int header_result = archive_write_header(a, entry);
+            ok = (header_result == ARCHIVE_OK || header_result == ARCHIVE_WARN)
+                && archive_write_data(a, content.c_str(), content.size()) >= 0;
+
+            archive_entry_free(entry);
+            if (!ok)
+                break;
+        }
+    }
+
+    /// Always close and free the writer, even after a failure.
+    if (archive_write_close(a) != ARCHIVE_OK)
+        ok = false;
+    archive_write_free(a);
+
+    return ok;
+}

 class ArchiveReaderAndWriterTest : public ::testing::TestWithParam<const char *>
 {
@ -327,6 +369,127 @@ TEST_P(ArchiveReaderAndWriterTest, ArchiveNotExist)
                    [&]{ createArchiveReader(getPathToArchive()); });
 }

+/// The single entry of a freshly written tar archive must be reported as existing by the reader.
+TEST(TarArchiveReaderTest, FileExists)
+{
+    const String tar_path = "archive.tar";
+    const String entry_name = "file.txt";
+    const String entry_data = "test";
+
+    const bool archive_created = createArchiveWithFiles<ArchiveType::Tar>(tar_path, {{entry_name, entry_data}});
+    EXPECT_EQ(archive_created, true);
+
+    auto archive_reader = createArchiveReader(tar_path);
+    EXPECT_EQ(archive_reader->fileExists(entry_name), true);
+
+    fs::remove(tar_path);
+}
+
+/// Reading the single entry of a tar archive must give back exactly the bytes that were written.
+TEST(TarArchiveReaderTest, ReadFile)
+{
+    const String tar_path = "archive.tar";
+    const String entry_name = "file.txt";
+    const String entry_data = "test";
+
+    const bool archive_created = createArchiveWithFiles<ArchiveType::Tar>(tar_path, {{entry_name, entry_data}});
+    EXPECT_EQ(archive_created, true);
+
+    auto archive_reader = createArchiveReader(tar_path);
+    auto entry_buffer = archive_reader->readFile(entry_name);
+
+    String read_back;
+    readStringUntilEOF(read_back, *entry_buffer);
+    EXPECT_EQ(read_back, entry_data);
+
+    fs::remove(tar_path);
+}
+
+/// A tar archive with two entries: both must exist and both must be readable, one after the other,
+/// through the same reader.
+TEST(TarArchiveReaderTest, ReadTwoFiles)
+{
+    const String tar_path = "archive.tar";
+    const String first_name = "file1.txt";
+    const String first_data = "test1";
+    const String second_name = "file2.txt";
+    const String second_data = "test2";
+
+    const bool archive_created
+        = createArchiveWithFiles<ArchiveType::Tar>(tar_path, {{first_name, first_data}, {second_name, second_data}});
+    EXPECT_EQ(archive_created, true);
+
+    auto archive_reader = createArchiveReader(tar_path);
+    EXPECT_EQ(archive_reader->fileExists(first_name), true);
+    EXPECT_EQ(archive_reader->fileExists(second_name), true);
+
+    String read_back;
+
+    auto entry_buffer = archive_reader->readFile(first_name);
+    readStringUntilEOF(read_back, *entry_buffer);
+    EXPECT_EQ(read_back, first_data);
+
+    /// The same buffer variable and the same string are reused for the second entry.
+    entry_buffer = archive_reader->readFile(second_name);
+    readStringUntilEOF(read_back, *entry_buffer);
+    EXPECT_EQ(read_back, second_data);
+
+    fs::remove(tar_path);
+}
+
+
+/// File info of a tar entry: uncompressed size equals the content length, compressed size is positive.
+TEST(TarArchiveReaderTest, CheckFileInfo)
+{
+    const String tar_path = "archive.tar";
+    const String entry_name = "file.txt";
+    const String entry_data = "test";
+
+    const bool archive_created = createArchiveWithFiles<ArchiveType::Tar>(tar_path, {{entry_name, entry_data}});
+    EXPECT_EQ(archive_created, true);
+
+    auto archive_reader = createArchiveReader(tar_path);
+    auto entry_info = archive_reader->getFileInfo(entry_name);
+    EXPECT_EQ(entry_info.uncompressed_size, entry_data.size());
+    EXPECT_GT(entry_info.compressed_size, 0);
+
+    fs::remove(tar_path);
+}
+
+/// The single entry of a freshly written 7z archive must be reported as existing by the reader.
+TEST(SevenZipArchiveReaderTest, FileExists)
+{
+    const String seven_zip_path = "archive.7z";
+    const String entry_name = "file.txt";
+    const String entry_data = "test";
+
+    const bool archive_created = createArchiveWithFiles<ArchiveType::SevenZip>(seven_zip_path, {{entry_name, entry_data}});
+    EXPECT_EQ(archive_created, true);
+
+    auto archive_reader = createArchiveReader(seven_zip_path);
+    EXPECT_EQ(archive_reader->fileExists(entry_name), true);
+
+    fs::remove(seven_zip_path);
+}
+
+/// Reading the single entry of a 7z archive must give back exactly the bytes that were written.
+TEST(SevenZipArchiveReaderTest, ReadFile)
+{
+    const String seven_zip_path = "archive.7z";
+    const String entry_name = "file.txt";
+    const String entry_data = "test";
+
+    const bool archive_created = createArchiveWithFiles<ArchiveType::SevenZip>(seven_zip_path, {{entry_name, entry_data}});
+    EXPECT_EQ(archive_created, true);
+
+    auto archive_reader = createArchiveReader(seven_zip_path);
+    auto entry_buffer = archive_reader->readFile(entry_name);
+
+    String read_back;
+    readStringUntilEOF(read_back, *entry_buffer);
+    EXPECT_EQ(read_back, entry_data);
+
+    fs::remove(seven_zip_path);
+}
+
+/// File info of a 7z entry: uncompressed size equals the content length, compressed size is positive.
+TEST(SevenZipArchiveReaderTest, CheckFileInfo)
+{
+    const String seven_zip_path = "archive.7z";
+    const String entry_name = "file.txt";
+    const String entry_data = "test";
+
+    const bool archive_created = createArchiveWithFiles<ArchiveType::SevenZip>(seven_zip_path, {{entry_name, entry_data}});
+    EXPECT_EQ(archive_created, true);
+
+    auto archive_reader = createArchiveReader(seven_zip_path);
+    auto entry_info = archive_reader->getFileInfo(entry_name);
+    EXPECT_EQ(entry_info.uncompressed_size, entry_data.size());
+    EXPECT_GT(entry_info.compressed_size, 0);
+
+    fs::remove(seven_zip_path);
+}
+
+/// A 7z archive with two entries: both must exist and both must be readable, one after the other,
+/// through the same reader.
+TEST(SevenZipArchiveReaderTest, ReadTwoFiles)
+{
+    const String seven_zip_path = "archive.7z";
+    const String first_name = "file1.txt";
+    const String first_data = "test1";
+    const String second_name = "file2.txt";
+    const String second_data = "test2";
+
+    const bool archive_created
+        = createArchiveWithFiles<ArchiveType::SevenZip>(seven_zip_path, {{first_name, first_data}, {second_name, second_data}});
+    EXPECT_EQ(archive_created, true);
+
+    auto archive_reader = createArchiveReader(seven_zip_path);
+    EXPECT_EQ(archive_reader->fileExists(first_name), true);
+    EXPECT_EQ(archive_reader->fileExists(second_name), true);
+
+    String read_back;
+
+    auto entry_buffer = archive_reader->readFile(first_name);
+    readStringUntilEOF(read_back, *entry_buffer);
+    EXPECT_EQ(read_back, first_data);
+
+    /// The same buffer variable and the same string are reused for the second entry.
+    entry_buffer = archive_reader->readFile(second_name);
+    readStringUntilEOF(read_back, *entry_buffer);
+    EXPECT_EQ(read_back, second_data);
+
+    fs::remove(seven_zip_path);
+}
+

 #if USE_MINIZIP

@ -334,7 +497,7 @@ namespace
 {
    const char * supported_archive_file_exts[] =
    {
-        ".zip",
+        ".zip"
    };
 }

--- a/src/Interpreters/ActionsVisitor.cpp
+++ b/src/Interpreters/ActionsVisitor.cpp
@ -94,38 +94,6 @@ static size_t getTypeDepth(const DataTypePtr & type)
    return 0;
 }

-template <typename T>
-static bool decimalEqualsFloat(Field field, Float64 float_value)
-{
-    auto decimal_field = field.get<DecimalField<T>>();
-    auto decimal_to_float = DecimalUtils::convertTo<Float64>(decimal_field.getValue(), decimal_field.getScale());
-    return decimal_to_float == float_value;
-}
-
-/// Applies stricter rules than convertFieldToType:
-/// Doesn't allow :
-/// - loss of precision converting to Decimal
-static bool convertFieldToTypeStrict(const Field & from_value, const IDataType & to_type, Field & result_value)
-{
-    result_value = convertFieldToType(from_value, to_type);
-    if (Field::isDecimal(from_value.getType()) && Field::isDecimal(result_value.getType()))
-        return applyVisitor(FieldVisitorAccurateEquals{}, from_value, result_value);
-    if (from_value.getType() == Field::Types::Float64 && Field::isDecimal(result_value.getType()))
-    {
-        /// Convert back to Float64 and compare
-        if (result_value.getType() == Field::Types::Decimal32)
-            return decimalEqualsFloat<Decimal32>(result_value, from_value.get<Float64>());
-        if (result_value.getType() == Field::Types::Decimal64)
-            return decimalEqualsFloat<Decimal64>(result_value, from_value.get<Float64>());
-        if (result_value.getType() == Field::Types::Decimal128)
-            return decimalEqualsFloat<Decimal128>(result_value, from_value.get<Float64>());
-        if (result_value.getType() == Field::Types::Decimal256)
-            return decimalEqualsFloat<Decimal256>(result_value, from_value.get<Float64>());
-        throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown decimal type {}", result_value.getTypeName());
-    }
-    return true;
-}
-
 /// The `convertFieldToTypeStrict` is used to prevent unexpected results in case of conversion with loss of precision.
 /// Example: `SELECT 33.3 :: Decimal(9, 1) AS a WHERE a IN (33.33 :: Decimal(9, 2))`
 /// 33.33 in the set is converted to 33.3, but it is not equal to 33.3 in the column, so the result should still be empty.
@ -146,11 +114,10 @@ static Block createBlockFromCollection(const Collection & collection, const Data
    {
        if (columns_num == 1)
        {
-            Field field;
-            bool is_conversion_ok = convertFieldToTypeStrict(value, *types[0], field);
+            auto field = convertFieldToTypeStrict(value, *types[0]);
            bool need_insert_null = transform_null_in && types[0]->isNullable();
-            if (is_conversion_ok && (!field.isNull() || need_insert_null))
-                columns[0]->insert(field);
+            if (field && (!field->isNull() || need_insert_null))
+                columns[0]->insert(*field);
        }
        else
        {
@ -171,9 +138,10 @@ static Block createBlockFromCollection(const Collection & collection, const Data
            size_t i = 0;
            for (; i < tuple_size; ++i)
            {
-                bool is_conversion_ok = convertFieldToTypeStrict(tuple[i], *types[i], tuple_values[i]);
-                if (!is_conversion_ok)
+                auto converted_field = convertFieldToTypeStrict(tuple[i], *types[i]);
+                if (!converted_field)
                    break;
+                tuple_values[i] = std::move(*converted_field);

                bool need_insert_null = transform_null_in && types[i]->isNullable();
                if (tuple_values[i].isNull() && !need_insert_null)
--- a/src/Interpreters/Cache/Metadata.cpp
+++ b/src/Interpreters/Cache/Metadata.cpp
@ -62,11 +62,13 @@ KeyMetadata::KeyMetadata(
    CleanupQueue & cleanup_queue_,
    DownloadQueue & download_queue_,
    Poco::Logger * log_,
+    std::shared_mutex & key_prefix_directory_mutex_,
    bool created_base_directory_)
    : key(key_)
    , key_path(key_path_)
    , cleanup_queue(cleanup_queue_)
    , download_queue(download_queue_)
+    , key_prefix_directory_mutex(key_prefix_directory_mutex_)
    , created_base_directory(created_base_directory_)
    , log(log_)
 {
@ -102,16 +104,21 @@ bool KeyMetadata::createBaseDirectory()
    {
        try
        {
+            std::shared_lock lock(key_prefix_directory_mutex);
            fs::create_directories(key_path);
        }
-        catch (...)
+        catch (const fs::filesystem_error & e)
        {
-            /// Avoid errors like
-            /// std::__1::__fs::filesystem::filesystem_error: filesystem error: in create_directories: No space left on device
-            /// and mark file segment with SKIP_CACHE state
-            tryLogCurrentException(__PRETTY_FUNCTION__);
            created_base_directory = false;
-            return false;
+
+            if (e.code() == std::errc::no_space_on_device)
+            {
+                LOG_TRACE(log, "Failed to create base directory for key {}, "
+                          "because no space left on device", key);
+
+                return false;
+            }
+            throw;
        }
    }
    return true;
@ -200,7 +207,7 @@ LockedKeyPtr CacheMetadata::lockKeyMetadata(

            it = emplace(
                key, std::make_shared<KeyMetadata>(
-                    key, getPathForKey(key), *cleanup_queue, *download_queue, log, is_initial_load)).first;
+                    key, getPathForKey(key), *cleanup_queue, *download_queue, log, key_prefix_directory_mutex, is_initial_load)).first;
        }

        key_metadata = it->second;
@ -315,17 +322,10 @@ void CacheMetadata::doCleanup()

        try
        {
+            std::unique_lock mutex(key_prefix_directory_mutex);
            if (fs::exists(key_prefix_directory) && fs::is_empty(key_prefix_directory))
                fs::remove(key_prefix_directory);
        }
-        catch (const fs::filesystem_error & e)
-        {
-            /// Key prefix directory can become non-empty just now, it is expected.
-            if (e.code() == std::errc::directory_not_empty)
-                continue;
-            LOG_ERROR(log, "Error while removing key {}: {}", cleanup_key, getCurrentExceptionMessage(true));
-            chassert(false);
-        }
        catch (...)
        {
            LOG_ERROR(log, "Error while removing key {}: {}", cleanup_key, getCurrentExceptionMessage(true));
--- a/src/Interpreters/Cache/Metadata.h
+++ b/src/Interpreters/Cache/Metadata.h
@ -5,6 +5,7 @@
 #include <Interpreters/Cache/FileCacheKey.h>
 #include <Interpreters/Cache/FileSegment.h>
 #include <Interpreters/Cache/FileCache_fwd_internal.h>
+#include <shared_mutex>

 namespace DB
 {
@ -50,6 +51,7 @@ struct KeyMetadata : public std::map<size_t, FileSegmentMetadataPtr>,
        CleanupQueue & cleanup_queue_,
        DownloadQueue & download_queue_,
        Poco::Logger * log_,
+        std::shared_mutex & key_prefix_directory_mutex_,
        bool created_base_directory_ = false);

    enum class KeyState
@ -76,6 +78,7 @@ private:
    KeyGuard guard;
    CleanupQueue & cleanup_queue;
    DownloadQueue & download_queue;
+    std::shared_mutex & key_prefix_directory_mutex;
    std::atomic<bool> created_base_directory = false;
    Poco::Logger * log;
 };
@ -128,6 +131,7 @@ private:
    mutable CacheMetadataGuard guard;
    const CleanupQueuePtr cleanup_queue;
    const DownloadQueuePtr download_queue;
+    std::shared_mutex key_prefix_directory_mutex;
    Poco::Logger * log;

    void downloadImpl(FileSegment & file_segment, std::optional<Memory<>> & memory);
--- a/src/Interpreters/ExpressionAnalyzer.cpp
+++ b/src/Interpreters/ExpressionAnalyzer.cpp
@ -1130,9 +1130,17 @@ JoinPtr SelectQueryExpressionAnalyzer::makeJoin(

    if (auto storage = analyzed_join->getStorageJoin())
    {
+        auto joined_block_actions = analyzed_join->createJoinedBlockActions(getContext());
+        NamesWithAliases required_columns_with_aliases = analyzed_join->getRequiredColumns(
+            Block(joined_block_actions->getResultColumns()), joined_block_actions->getRequiredColumns().getNames());
+
+        Names original_right_column_names;
+        for (auto & pr : required_columns_with_aliases)
+            original_right_column_names.push_back(pr.first);
+
        auto right_columns = storage->getRightSampleBlock().getColumnsWithTypeAndName();
        std::tie(left_convert_actions, right_convert_actions) = analyzed_join->createConvertingActions(left_columns, right_columns);
-        return storage->getJoinLocked(analyzed_join, getContext());
+        return storage->getJoinLocked(analyzed_join, getContext(), original_right_column_names);
    }

    joined_plan = buildJoinedPlan(getContext(), join_element, *analyzed_join, query_options);
--- a/src/Interpreters/Session.cpp
+++ b/src/Interpreters/Session.cpp
@ -520,6 +520,8 @@ ContextMutablePtr Session::makeSessionContext()
        {},
        session_context->getSettingsRef().max_sessions_for_user);

+    recordLoginSucess(session_context);
+
    return session_context;
 }

@ -582,6 +584,8 @@ ContextMutablePtr Session::makeSessionContext(const String & session_name_, std:
        { session_name_ },
        max_sessions_for_user);

+    recordLoginSucess(session_context);
+
    return session_context;
 }

@ -655,24 +659,38 @@ ContextMutablePtr Session::makeQueryContextImpl(const ClientInfo * client_info_t
    if (user_id)
        user = query_context->getUser();

-    if (!notified_session_log_about_login)
-    {
-        if (auto session_log = getSessionLog())
-        {
-            session_log->addLoginSuccess(
-                    auth_id,
-                    named_session ? std::optional<std::string>(named_session->key.second) : std::nullopt,
-                    *query_context,
-                    user);
-
-            notified_session_log_about_login = true;
-        }
-    }
+    /// Interserver does not create session context
+    recordLoginSucess(query_context);

    return query_context;
 }


+/// Reports a successful login to the session log, at most once per Session object.
+/// Does nothing if the login was already reported or if getSessionLog() returns nothing.
+/// Throws LOGICAL_ERROR when called with a null `login_context`.
+void Session::recordLoginSucess(ContextPtr login_context) const
+{
+    if (notified_session_log_about_login)
+        return;
+
+    if (!login_context)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Session or query context must be created");
+
+    auto session_log = getSessionLog();
+    if (!session_log)
+        return;
+
+    /// Settings and access are taken from the context the login is attributed to.
+    const auto & login_settings = login_context->getSettingsRef();
+    const auto login_access = login_context->getAccess();
+
+    session_log->addLoginSuccess(
+        auth_id,
+        named_session ? named_session->key.second : "",
+        login_settings,
+        login_access,
+        getClientInfo(),
+        user);
+
+    /// The flag is set only after the entry has been handed to the log.
+    notified_session_log_about_login = true;
+}
+
+
 void Session::releaseSessionID()
 {
    if (!named_session)
--- a/src/Interpreters/Session.h
+++ b/src/Interpreters/Session.h
@ -97,6 +97,8 @@ public:
 private:
    std::shared_ptr<SessionLog> getSessionLog() const;
    ContextMutablePtr makeQueryContextImpl(const ClientInfo * client_info_to_copy, ClientInfo * client_info_to_move) const;
+    void recordLoginSucess(ContextPtr login_context) const;
+

    mutable bool notified_session_log_about_login = false;
    const UUID auth_id;
--- a/src/Interpreters/SessionLog.cpp
+++ b/src/Interpreters/SessionLog.cpp
@ -199,12 +199,13 @@ void SessionLogElement::appendToBlock(MutableColumns & columns) const
    columns[i++]->insertData(auth_failure_reason.data(), auth_failure_reason.length());
 }

-void SessionLog::addLoginSuccess(const UUID & auth_id, std::optional<String> session_id, const Context & login_context, const UserPtr & login_user)
+void SessionLog::addLoginSuccess(const UUID & auth_id,
+                                 const String & session_id,
+                                 const Settings & settings,
+                                 const ContextAccessPtr & access,
+                                 const ClientInfo & client_info,
+                                 const UserPtr & login_user)
 {
-    const auto access = login_context.getAccess();
-    const auto & settings = login_context.getSettingsRef();
-    const auto & client_info = login_context.getClientInfo();
-
    DB::SessionLogElement log_entry(auth_id, SESSION_LOGIN_SUCCESS);
    log_entry.client_info = client_info;

@ -215,8 +216,7 @@ void SessionLog::addLoginSuccess(const UUID & auth_id, std::optional<String> ses
    }
    log_entry.external_auth_server = login_user ? login_user->auth_data.getLDAPServerName() : "";

-    if (session_id)
-        log_entry.session_id = *session_id;
+    log_entry.session_id = session_id;

    if (const auto roles_info = access->getRolesInfo())
        log_entry.roles = roles_info->getCurrentRolesNames();
--- a/src/Interpreters/SessionLog.h
+++ b/src/Interpreters/SessionLog.h
@ -20,6 +20,7 @@ enum SessionLogElementType : int8_t
 class ContextAccess;
 struct User;
 using UserPtr = std::shared_ptr<const User>;
+using ContextAccessPtr = std::shared_ptr<const ContextAccess>;

 /** A struct which will be inserted as row into session_log table.
  *
@ -72,7 +73,13 @@ class SessionLog : public SystemLog<SessionLogElement>
    using SystemLog<SessionLogElement>::SystemLog;

 public:
-    void addLoginSuccess(const UUID & auth_id, std::optional<String> session_id, const Context & login_context, const UserPtr & login_user);
+    void addLoginSuccess(const UUID & auth_id,
+                         const String & session_id,
+                         const Settings & settings,
+                         const ContextAccessPtr & access,
+                         const ClientInfo & client_info,
+                         const UserPtr & login_user);
+
    void addLoginFailure(const UUID & auth_id, const ClientInfo & info, const std::optional<String> & user, const Exception & reason);
    void addLogOut(const UUID & auth_id, const UserPtr & login_user, const ClientInfo & client_info);
 };
--- a/src/Interpreters/TableJoin.h
+++ b/src/Interpreters/TableJoin.h
@ -189,7 +189,6 @@ private:
    template <typename LeftNamesAndTypes, typename RightNamesAndTypes>
    void inferJoinKeyCommonType(const LeftNamesAndTypes & left, const RightNamesAndTypes & right, bool allow_right, bool strict);

-    NamesAndTypesList correctedColumnsAddedByJoin() const;

    void deduplicateAndQualifyColumnNames(const NameSet & left_table_columns, const String & right_table_prefix);

@ -371,6 +370,8 @@ public:
    bool isSpecialStorage() const { return !right_storage_name.empty() || right_storage_join || right_kv_storage; }

    std::shared_ptr<const IKeyValueEntity> getStorageKeyValue() { return right_kv_storage; }
+
+    NamesAndTypesList correctedColumnsAddedByJoin() const;
 };

 }
--- a/src/Interpreters/ThreadStatusExt.cpp
+++ b/src/Interpreters/ThreadStatusExt.cpp
@ -382,12 +382,10 @@ void ThreadStatus::finalizePerformanceCounters()
    updatePerformanceCounters();

    // We want to close perf file descriptors if the perf events were enabled for
-    // one query. What this code does in practice is less clear -- e.g., if I run
-    // 'select 1 settings metrics_perf_events_enabled = 1', I still get
-    // query_context->getSettingsRef().metrics_perf_events_enabled == 0 *shrug*.
+    // one query.
    bool close_perf_descriptors = true;
-    if (auto query_context_ptr = query_context.lock())
-        close_perf_descriptors = !query_context_ptr->getSettingsRef().metrics_perf_events_enabled;
+    if (auto global_context_ptr = global_context.lock())
+        close_perf_descriptors = !global_context_ptr->getSettingsRef().metrics_perf_events_enabled;

    try
    {
@ -410,7 +408,7 @@ void ThreadStatus::finalizePerformanceCounters()
            if (settings.log_queries && settings.log_query_threads)
            {
                const auto now = std::chrono::system_clock::now();
-                Int64 query_duration_ms = std::chrono::duration_cast<std::chrono::microseconds>(now - query_start_time.point).count();
+                Int64 query_duration_ms = std::chrono::duration_cast<std::chrono::milliseconds>(now - query_start_time.point).count();
                if (query_duration_ms >= settings.log_queries_min_query_duration_ms.totalMilliseconds())
                {
                    if (auto thread_log = global_context_ptr->getQueryThreadLog())
--- a/Show More
+++ b/Show More
				`@ -0,0 +1 @@`
				`Subproject commit ee45796171324519f0c0bfd012018dd099296336`