From 2d3fb36a620ef755a3c9b6ed9993382f6e23d4b4 Mon Sep 17 00:00:00 2001
From: Azat Khuzhin
Date: Wed, 31 Aug 2022 19:34:35 +0200
Subject: [PATCH 01/24] Fix debug symbols (Add a quirk to force clang to emit
 .debug_aranges with ThinLTO)

Wrap the linker in a script that adds the extra settings (`-mllvm
-generate-arange-section`) in the ThinLTO case, so that the
`.debug_aranges` section is emitted.

Discussion in the LLVM Discourse can be found here [1].

  [1]: https://discourse.llvm.org/t/clang-does-not-produce-full-debug-aranges-section-with-thinlto/64898

Signed-off-by: Azat Khuzhin
---
 CMakeLists.txt                                |  2 ++
 cmake/ld.lld.in                               | 20 ++++++++++++++++++++
 cmake/tools.cmake                             |  9 +++++++--
 .../02420_stracktrace_debug_symbols.reference |  1 +
 .../02420_stracktrace_debug_symbols.sh        | 14 ++++++++++++++
 5 files changed, 44 insertions(+), 2 deletions(-)
 create mode 100755 cmake/ld.lld.in
 create mode 100644 tests/queries/0_stateless/02420_stracktrace_debug_symbols.reference
 create mode 100755 tests/queries/0_stateless/02420_stracktrace_debug_symbols.sh

diff --git a/CMakeLists.txt b/CMakeLists.txt
index dbbec2a600d..64fb870b61b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -143,6 +143,8 @@ include (cmake/add_warning.cmake)
 if (COMPILER_CLANG)
     # generate ranges for fast "addr2line" search
     if (NOT CMAKE_BUILD_TYPE_UC STREQUAL "RELEASE")
+        # NOTE: clang has a bug and does not emit .debug_aranges with ThinLTO,
+        # so a custom ld.lld wrapper is shipped in docker images.
         set(COMPILER_FLAGS "${COMPILER_FLAGS} -gdwarf-aranges")
     endif ()
 
diff --git a/cmake/ld.lld.in b/cmake/ld.lld.in
new file mode 100755
index 00000000000..9736dab1bc3
--- /dev/null
+++ b/cmake/ld.lld.in
@@ -0,0 +1,20 @@
+#!/usr/bin/env bash
+
+# This is a workaround for a bug in llvm/clang
+# that does not produce .debug_aranges with LTO
+#
+# NOTE: this is a temporary solution that should be removed once [1] is
+# resolved.
+#
+# [1]: https://discourse.llvm.org/t/clang-does-not-produce-full-debug-aranges-section-with-thinlto/64898/8
+
+# NOTE: only -flto=thin is supported.
+# NOTE: it is not possible to check whether -gdwarf-aranges was passed initially or not.
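+# An illustrative note (an assumption about the clang driver, not verified here):
+# for a ThinLTO link the driver passes -plugin-opt=thinlto* to the linker, so
+# the match below injects the extra flag only for ThinLTO links.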
+if [[ "$*" =~ -plugin-opt=thinlto ]]; then
+    exec "@LLD_PATH@" -mllvm -generate-arange-section "$@"
+else
+    exec "@LLD_PATH@" "$@"
+fi
diff --git a/cmake/tools.cmake b/cmake/tools.cmake
index 5b005a6f1f9..57d39899a40 100644
--- a/cmake/tools.cmake
+++ b/cmake/tools.cmake
@@ -94,8 +94,13 @@ if (LINKER_NAME)
     if (NOT LLD_PATH)
         message (FATAL_ERROR "Using linker ${LINKER_NAME} but can't find its path.")
     endif ()
-    set (CMAKE_EXE_LINKER_FLAGS    "${CMAKE_EXE_LINKER_FLAGS} --ld-path=${LLD_PATH}")
-    set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} --ld-path=${LLD_PATH}")
+
+    # This is a temporary quirk to emit .debug_aranges with ThinLTO
+    set (LLD_WRAPPER "${CMAKE_CURRENT_BINARY_DIR}/ld.lld")
+    configure_file ("${CMAKE_CURRENT_SOURCE_DIR}/cmake/ld.lld.in" "${LLD_WRAPPER}" @ONLY)
+
+    set (CMAKE_EXE_LINKER_FLAGS    "${CMAKE_EXE_LINKER_FLAGS} --ld-path=${LLD_WRAPPER}")
+    set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} --ld-path=${LLD_WRAPPER}")
 else ()
     set (CMAKE_EXE_LINKER_FLAGS    "${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=${LINKER_NAME}")
     set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -fuse-ld=${LINKER_NAME}")
diff --git a/tests/queries/0_stateless/02420_stracktrace_debug_symbols.reference b/tests/queries/0_stateless/02420_stracktrace_debug_symbols.reference
new file mode 100644
index 00000000000..d00491fd7e5
--- /dev/null
+++ b/tests/queries/0_stateless/02420_stracktrace_debug_symbols.reference
@@ -0,0 +1 @@
+1
diff --git a/tests/queries/0_stateless/02420_stracktrace_debug_symbols.sh b/tests/queries/0_stateless/02420_stracktrace_debug_symbols.sh
new file mode 100755
index 00000000000..9b647ec984b
--- /dev/null
+++ b/tests/queries/0_stateless/02420_stracktrace_debug_symbols.sh
@@ -0,0 +1,14 @@
+#!/usr/bin/env bash
+
+CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+# shellcheck source=../shell_config.sh
+. "$CUR_DIR"/../shell_config.sh
+
+# NOTE: this test uses the stack trace instead of addressToLineWithInlines() or
+# similar, since that code uses (or might use) a different code path in the
+# Dwarf parser.
+#
+# Also note that to rely on this test one should assume that CI packages use
+# ThinLTO builds.
+
+$CLICKHOUSE_LOCAL --stacktrace -q 'select throwIf(1)' |& grep -c 'Common/Exception.cpp:[0-9]*: DB::Exception::Exception'

From 45afaa6fb8b5166e0770e7e54525af0e810323fd Mon Sep 17 00:00:00 2001
From: Azat Khuzhin
Date: Thu, 1 Sep 2022 18:31:22 +0200
Subject: [PATCH 02/24] Fix loading external symbols

Signed-off-by: Azat Khuzhin
---
 src/Common/SymbolIndex.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/Common/SymbolIndex.cpp b/src/Common/SymbolIndex.cpp
index 46d9e8dbd5c..c02d17a381e 100644
--- a/src/Common/SymbolIndex.cpp
+++ b/src/Common/SymbolIndex.cpp
@@ -37,7 +37,7 @@ But because ClickHouse is linked with most of the symbols exported (-rdynamic fl
 It allows to get source file names and line numbers from addresses. Only available if you use -g option for compiler.
 It is also used by default for ClickHouse builds, but because of its weight (about two gigabytes)
 it is split to separate binary and provided in clickhouse-common-static-dbg package.
-This separate binary is placed in /usr/lib/debug/usr/bin/clickhouse and is loaded automatically by tools like gdb, addr2line.
+This separate binary is placed in /usr/lib/debug/usr/bin/clickhouse.debug and is loaded automatically by tools like gdb, addr2line.
 When you build ClickHouse by yourself, debug info is not split and present in a single huge binary.
 What ClickHouse is using to provide good stack traces?
 
@@ -391,6 +391,7 @@ void collectSymbolsFromELF(
     std::filesystem::path local_debug_info_path = canonical_path.parent_path() / canonical_path.stem();
     local_debug_info_path += ".debug";
     std::filesystem::path debug_info_path = std::filesystem::path("/usr/lib/debug") / canonical_path.relative_path();
+    debug_info_path += ".debug";
 
     if (std::filesystem::exists(local_debug_info_path))
         object_name = local_debug_info_path;

From 26f1c1504a6291403181a17df9f1665585da3b6f Mon Sep 17 00:00:00 2001
From: Azat Khuzhin
Date: Thu, 1 Sep 2022 18:52:48 +0200
Subject: [PATCH 03/24] Do not compress debug sections (internal DWARF parser
 cannot handle them)

Although this increases the debug symbol size from 510MB to 1.8GB, it is
not a problem for packages, since they are compressed anyway. Checked
the deb package: its size increased from 834M to 962M.

Signed-off-by: Azat Khuzhin
---
 cmake/split_debug_symbols.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/split_debug_symbols.cmake b/cmake/split_debug_symbols.cmake
index 12182ed9c20..a9c2158359a 100644
--- a/cmake/split_debug_symbols.cmake
+++ b/cmake/split_debug_symbols.cmake
@@ -20,7 +20,7 @@ macro(clickhouse_split_debug_symbols)
         COMMAND mkdir -p "${STRIP_DESTINATION_DIR}/bin"
         COMMAND cp "${STRIP_BINARY_PATH}" "${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET}"
         # Splits debug symbols into separate file, leaves the binary untouched:
-        COMMAND "${OBJCOPY_PATH}" --only-keep-debug --compress-debug-sections "${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET}" "${STRIP_DESTINATION_DIR}/lib/debug/bin/${STRIP_TARGET}.debug"
+        COMMAND "${OBJCOPY_PATH}" --only-keep-debug "${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET}" "${STRIP_DESTINATION_DIR}/lib/debug/bin/${STRIP_TARGET}.debug"
         COMMAND chmod 0644 "${STRIP_DESTINATION_DIR}/lib/debug/bin/${STRIP_TARGET}.debug"
         # Strips binary, sections '.note' & '.comment' are removed in line with Debian's stripping policy: www.debian.org/doc/debian-policy/ch-files.html, section '.clickhouse.hash' is needed for integrity check:
         COMMAND "${STRIP_PATH}" --remove-section=.comment --remove-section=.note --keep-section=.clickhouse.hash "${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET}"

From 597197c01d0c7bd7c132133ebe2d210d9ab6609c Mon Sep 17 00:00:00 2001
From: Azat Khuzhin
Date: Fri, 2 Sep 2022 09:03:39 +0200
Subject: [PATCH 04/24] Improve error messages in Elf loader

Signed-off-by: Azat Khuzhin
---
 src/Common/Elf.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/Common/Elf.cpp b/src/Common/Elf.cpp
index b735367b179..0515cc0765a 100644
--- a/src/Common/Elf.cpp
+++ b/src/Common/Elf.cpp
@@ -22,13 +22,13 @@ Elf::Elf(const std::string & path)
     /// Check if it's an elf.
     elf_size = in.buffer().size();
     if (elf_size < sizeof(ElfEhdr))
-        throw Exception("The size of supposedly ELF file is too small", ErrorCodes::CANNOT_PARSE_ELF);
+        throw Exception(ErrorCodes::CANNOT_PARSE_ELF, "The size of supposedly ELF file '{}' is too small", path);
 
     mapped = in.buffer().begin();
     header = reinterpret_cast<const ElfEhdr *>(mapped);
 
     if (memcmp(header->e_ident, "\x7F""ELF", 4) != 0)
-        throw Exception("The file is not ELF according to magic", ErrorCodes::CANNOT_PARSE_ELF);
+        throw Exception(ErrorCodes::CANNOT_PARSE_ELF, "The file '{}' is not ELF according to magic", path);
 
     /// Get section header.
     ElfOff section_header_offset = header->e_shoff;
@@ -37,7 +37,7 @@ Elf::Elf(const std::string & path)
 
     if (!section_header_offset
         || !section_header_num_entries
        || section_header_offset + section_header_num_entries * sizeof(ElfShdr) > elf_size)
-        throw Exception("The ELF is truncated (section header points after end of file)", ErrorCodes::CANNOT_PARSE_ELF);
+        throw Exception(ErrorCodes::CANNOT_PARSE_ELF, "The ELF '{}' is truncated (section header points after end of file)", path);
 
     section_headers = reinterpret_cast<const ElfShdr *>(mapped + section_header_offset);
 
@@ -48,11 +48,11 @@
         });
 
     if (!section_names_strtab)
-        throw Exception("The ELF doesn't have string table with section names", ErrorCodes::CANNOT_PARSE_ELF);
+        throw Exception(ErrorCodes::CANNOT_PARSE_ELF, "The ELF '{}' doesn't have string table with section names", path);
 
     ElfOff section_names_offset = section_names_strtab->header.sh_offset;
     if (section_names_offset >= elf_size)
-        throw Exception("The ELF is truncated (section names string table points after end of file)", ErrorCodes::CANNOT_PARSE_ELF);
+        throw Exception(ErrorCodes::CANNOT_PARSE_ELF, "The ELF '{}' is truncated (section names string table points after end of file)", path);
 
     section_names = reinterpret_cast<const char *>(mapped + section_names_offset);
 
@@ -64,7 +64,7 @@ Elf::Elf(const std::string & path)
 
     if (!program_header_offset
         || !program_header_num_entries
        || program_header_offset + program_header_num_entries * sizeof(ElfPhdr) > elf_size)
-        throw Exception("The ELF is truncated (program header points after end of file)", ErrorCodes::CANNOT_PARSE_ELF);
+        throw Exception(ErrorCodes::CANNOT_PARSE_ELF, "The ELF '{}' is truncated (program header points after end of file)", path);
 
     program_headers = reinterpret_cast<const ElfPhdr *>(mapped + program_header_offset);
 }

From 9479e2143fa42d6ec7fb6703b3483a054a58cf7a Mon Sep 17 00:00:00 2001
From: Azat Khuzhin
Date: Fri, 2 Sep 2022 14:37:54 +0200
Subject: [PATCH 05/24] Do not try to load empty debug files

This avoids a CANNOT_PARSE_ELF error for builds that have an empty debug
file in the clickhouse-common-static-dbg package, i.e. debug builds.

Signed-off-by: Azat Khuzhin
---
 src/Common/SymbolIndex.cpp | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/src/Common/SymbolIndex.cpp b/src/Common/SymbolIndex.cpp
index c02d17a381e..e217d23cc27 100644
--- a/src/Common/SymbolIndex.cpp
+++ b/src/Common/SymbolIndex.cpp
@@ -393,9 +393,20 @@ void collectSymbolsFromELF(
     std::filesystem::path debug_info_path = std::filesystem::path("/usr/lib/debug") / canonical_path.relative_path();
     debug_info_path += ".debug";
 
-    if (std::filesystem::exists(local_debug_info_path))
+    /// NOTE: This is a workaround for the current package system.
+    ///
+    /// Since nfpm cannot copy a file only if it exists,
+    /// an empty .debug file is created in cmake instead,
+    /// but if we try to load an empty ELF file, then the CANNOT_PARSE_ELF
+    /// exception will be thrown from Elf::Elf.
+ auto exists_not_empty = [](const std::filesystem::path & path) + { + return std::filesystem::exists(path) && !std::filesystem::is_empty(path); + }; + + if (exists_not_empty(local_debug_info_path)) object_name = local_debug_info_path; - else if (std::filesystem::exists(debug_info_path)) + else if (exists_not_empty(debug_info_path)) object_name = debug_info_path; else if (build_id.size() >= 2) { @@ -413,7 +424,7 @@ void collectSymbolsFromELF( std::filesystem::path build_id_debug_info_path( fmt::format("/usr/lib/debug/.build-id/{}/{}.debug", build_id_hex.substr(0, 2), build_id_hex.substr(2))); - if (std::filesystem::exists(build_id_debug_info_path)) + if (exists_not_empty(build_id_debug_info_path)) object_name = build_id_debug_info_path; else object_name = canonical_path; From c6cbd981b6f97db2c9b5ffd42495ba2fa698ff3d Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Fri, 2 Sep 2022 23:20:51 +0200 Subject: [PATCH 06/24] tests: disable 02161_addressToLineWithInlines addressToLineWithInlines() may lead to the following error: Code: 465. DB::Exception: Received from localhost:9000. DB::Exception: could not find abbreviation code: while executing 'FUNCTION addressToLineWithInlines(arrayJoin(trace) :: 1) -> addressToLineWithInlines(arrayJoin(trace)) Array(String) : 0'. (CANNOT_PARSE_DWARF) CI: https://s3.amazonaws.com/clickhouse-test-reports/40873/45fd2bcb218ace3231a026eb91d688f0093c6407/stateless_tests__release_.html Signed-off-by: Azat Khuzhin --- tests/queries/0_stateless/02161_addressToLineWithInlines.sql | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02161_addressToLineWithInlines.sql b/tests/queries/0_stateless/02161_addressToLineWithInlines.sql index b6b497b4b55..e4624fffd48 100644 --- a/tests/queries/0_stateless/02161_addressToLineWithInlines.sql +++ b/tests/queries/0_stateless/02161_addressToLineWithInlines.sql @@ -1,4 +1,5 @@ --- Tags: no-tsan, no-asan, no-ubsan, no-msan, no-debug, no-cpu-aarch64 +-- Tags: no-tsan, no-asan, no-ubsan, no-msan, no-debug, no-cpu-aarch64, disabled +-- Tag disabled: Parsing inlines may lead to "could not find abbreviation code" (FIXME) SET allow_introspection_functions = 0; SELECT addressToLineWithInlines(1); -- { serverError 446 } From 8c3c3e766751ea9a1765b6f695b58446e5135644 Mon Sep 17 00:00:00 2001 From: Vladimir C Date: Mon, 5 Sep 2022 12:39:39 +0200 Subject: [PATCH 07/24] Minor update doc for mysql_port --- .../en/operations/server-configuration-parameters/settings.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/operations/server-configuration-parameters/settings.md b/docs/en/operations/server-configuration-parameters/settings.md index 24e08fe1fcd..b7fe7d49b7b 100644 --- a/docs/en/operations/server-configuration-parameters/settings.md +++ b/docs/en/operations/server-configuration-parameters/settings.md @@ -1452,7 +1452,7 @@ Port for communicating with clients over MySQL protocol. **Possible values** -Positive integer. +Positive integer to specify the port number to listen to or empty value to disable. Example @@ -1466,7 +1466,7 @@ Port for communicating with clients over PostgreSQL protocol. **Possible values** -Positive integer. +Positive integer to specify the port number to listen to or empty value to disable. 
Example From e1def723f8dea97b5fe71c09fcf15131e539d48c Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 5 Sep 2022 12:04:10 +0000 Subject: [PATCH 08/24] Add special x86-SSE2-only build --- .github/workflows/master.yml | 48 ++++++++++++++++++++++++++++++ .github/workflows/pull_request.yml | 46 ++++++++++++++++++++++++++++ cmake/cpu_features.cmake | 17 +++++++++++ docker/packager/packager | 4 +++ tests/ci/ci_config.py | 11 +++++++ 5 files changed, 126 insertions(+) diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml index e8e3deceef5..d3a303eb7ab 100644 --- a/.github/workflows/master.yml +++ b/.github/workflows/master.yml @@ -923,6 +923,53 @@ jobs: # shellcheck disable=SC2046 docker rm -f $(docker ps -a -q) ||: sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + BuilderBinAmd64SSE2: + needs: [DockerHubPush] + runs-on: [self-hosted, builder] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + BUILD_NAME=binary_amd64sse2 + EOF + - name: Download changed images + uses: actions/download-artifact@v2 + with: + name: changed_images + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" + - name: Check out repository code + uses: actions/checkout@v2 + with: + fetch-depth: 0 # otherwise we will have no info about contributors + - name: Build + run: | + git -C "$GITHUB_WORKSPACE" submodule sync --recursive + git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 + sudo rm -fr "$TEMP_PATH" + mkdir -p "$TEMP_PATH" + cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" + cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" + - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} + uses: actions/upload-artifact@v2 + with: + name: ${{ env.BUILD_URLS }} + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json + - name: Cleanup + if: always() + run: | + # shellcheck disable=SC2046 + docker kill $(docker ps -q) ||: + # shellcheck disable=SC2046 + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" ############################################################################################ ##################################### Docker images ####################################### ############################################################################################ @@ -1011,6 +1058,7 @@ jobs: - BuilderBinFreeBSD # - BuilderBinGCC - BuilderBinPPC64 + - BuilderBinAmd64SSE2 - BuilderBinClangTidy - BuilderDebShared runs-on: [self-hosted, style-checker] diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index 7901008a7db..c100b079ed5 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -935,6 +935,51 @@ jobs: # shellcheck disable=SC2046 docker rm -f $(docker ps -a -q) ||: sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + BuilderBinAmd64SSE2: + needs: [DockerHubPush, FastTest, StyleCheck] + runs-on: [self-hosted, builder] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + BUILD_NAME=binary_amd64sse2 + EOF + - name: Download changed images + uses: actions/download-artifact@v2 + with: + name: 
changed_images + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" + - name: Check out repository code + uses: actions/checkout@v2 + - name: Build + run: | + git -C "$GITHUB_WORKSPACE" submodule sync --recursive + git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 + sudo rm -fr "$TEMP_PATH" + mkdir -p "$TEMP_PATH" + cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" + cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" + - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} + uses: actions/upload-artifact@v2 + with: + name: ${{ env.BUILD_URLS }} + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json + - name: Cleanup + if: always() + run: | + # shellcheck disable=SC2046 + docker kill $(docker ps -q) ||: + # shellcheck disable=SC2046 + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" ############################################################################################ ##################################### Docker images ####################################### ############################################################################################ @@ -1023,6 +1068,7 @@ jobs: - BuilderBinFreeBSD # - BuilderBinGCC - BuilderBinPPC64 + - BuilderBinAmd64SSE2 - BuilderBinClangTidy - BuilderDebShared runs-on: [self-hosted, style-checker] diff --git a/cmake/cpu_features.cmake b/cmake/cpu_features.cmake index 1fc3c2db804..218b4deedce 100644 --- a/cmake/cpu_features.cmake +++ b/cmake/cpu_features.cmake @@ -24,6 +24,23 @@ option (ENABLE_BMI "Use BMI instructions on x86_64" 0) option (ENABLE_AVX2_FOR_SPEC_OP "Use avx2 instructions for specific operations on x86_64" 0) option (ENABLE_AVX512_FOR_SPEC_OP "Use avx512 instructions for specific operations on x86_64" 0) +# X86: Allow compilation for a SSE2-only target machine. Done by a special build in CI for embedded or very old hardware. +option (NO_SSE3_OR_HIGHER "Disable SSE3 or higher on x86_64" 0) +if (NO_SSE3_OR_HIGHER) + SET(ENABLE_SSSE3 0) + SET(ENABLE_SSE41 0) + SET(ENABLE_SSE42 0) + SET(ENABLE_PCLMULQDQ 0) + SET(ENABLE_POPCNT 0) + SET(ENABLE_AVX 0) + SET(ENABLE_AVX2 0) + SET(ENABLE_AVX512 0) + SET(ENABLE_AVX512_VBMI 0) + SET(ENABLE_BMI 0) + SET(ENABLE_AVX2_FOR_SPEC_OP 0) + SET(ENABLE_AVX512_FOR_SPEC_OP 0) +endif() + option (ARCH_NATIVE "Add -march=native compiler flag. This makes your binaries non-portable but more performant code may be generated. This option overrides ENABLE_* options for specific instruction set. Highly not recommended to use." 
0)
 
 if (ARCH_NATIVE)
diff --git a/docker/packager/packager b/docker/packager/packager
index 66eb568d460..f878444d4bc 100755
--- a/docker/packager/packager
+++ b/docker/packager/packager
@@ -190,6 +190,9 @@ def parse_env_variables(
         cc = compiler
         result.append("DEB_ARCH=amd64")
 
+    if compiler.endswith("-amd64sse2"):
+        cmake_flags.append('-DNO_SSE3_OR_HIGHER=1')
+
     cxx = cc.replace("gcc", "g++").replace("clang", "clang++")
 
     if package_type == "deb":
@@ -339,6 +342,7 @@ if __name__ == "__main__":
             "clang-14-darwin-aarch64",
             "clang-14-aarch64",
             "clang-14-ppc64le",
+            "clang-14-amd64sse2",
             "clang-14-freebsd",
             "gcc-11",
         ),
diff --git a/tests/ci/ci_config.py b/tests/ci/ci_config.py
index 3d0513bca47..b49e91a9e79 100644
--- a/tests/ci/ci_config.py
+++ b/tests/ci/ci_config.py
@@ -161,6 +161,16 @@ CI_CONFIG = {
             "tidy": "disable",
             "with_coverage": False,
         },
+        "binary_amd64sse2": {
+            "compiler": "clang-14-amd64sse2",
+            "build_type": "",
+            "sanitizer": "",
+            "package_type": "binary",
+            "static_binary_name": "amd64sse2",
+            "libraries": "static",
+            "tidy": "disable",
+            "with_coverage": False,
+        },
    },
    "builds_report_config": {
        "ClickHouse build check": [
@@ -182,6 +192,7 @@ CI_CONFIG = {
             "binary_freebsd",
             "binary_darwin_aarch64",
             "binary_ppc64le",
+            "binary_amd64sse2",
         ],
     },
     "tests_config": {

From 438ed368a1e56af77367b42f411b099992c5ba38 Mon Sep 17 00:00:00 2001
From: Robert Schulze
Date: Mon, 5 Sep 2022 17:49:00 +0000
Subject: [PATCH 09/24] fix: correct compiler parsing

---
 docker/packager/packager | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/docker/packager/packager b/docker/packager/packager
index f878444d4bc..363be9ab2dd 100755
--- a/docker/packager/packager
+++ b/docker/packager/packager
@@ -130,6 +130,7 @@ def parse_env_variables(
     ARM_SUFFIX = "-aarch64"
     FREEBSD_SUFFIX = "-freebsd"
     PPC_SUFFIX = "-ppc64le"
+    AMD64_SSE2_SUFFIX = "-amd64sse2"
 
     result = []
     result.append("OUTPUT_DIR=/output")
@@ -141,6 +142,7 @@ def parse_env_variables(
     is_cross_arm = compiler.endswith(ARM_SUFFIX)
     is_cross_ppc = compiler.endswith(PPC_SUFFIX)
     is_cross_freebsd = compiler.endswith(FREEBSD_SUFFIX)
+    is_amd64_sse2 = compiler.endswith(AMD64_SSE2_SUFFIX)
 
     if is_cross_darwin:
         cc = compiler[: -len(DARWIN_SUFFIX)]
@@ -186,13 +188,14 @@ def parse_env_variables(
         cmake_flags.append(
             "-DCMAKE_TOOLCHAIN_FILE=/build/cmake/linux/toolchain-ppc64le.cmake"
         )
+    elif is_amd64_sse2:
+        cc = compiler[: -len(AMD64_SSE2_SUFFIX)]
+        cmake_flags.append("-DNO_SSE3_OR_HIGHER=1")
+        result.append("DEB_ARCH=amd64")
     else:
         cc = compiler
         result.append("DEB_ARCH=amd64")
 
-    if compiler.endswith("-amd64sse2"):
-        cmake_flags.append('-DNO_SSE3_OR_HIGHER=1')
-
     cxx = cc.replace("gcc", "g++").replace("clang", "clang++")
 
     if package_type == "deb":

From 83514fa2ef3e0f8b1465622b5f162f67997790b2 Mon Sep 17 00:00:00 2001
From: kssenii
Date: Mon, 5 Sep 2022 18:55:00 +0200
Subject: [PATCH 10/24] Refactor

---
 src/Common/FieldVisitorToString.cpp           |  6 ++
 src/Common/FieldVisitorToString.h             |  5 +-
 .../getDictionaryConfigurationFromAST.cpp     | 19 ++----
 src/Disks/IDisk.h                             | 11 ++-
 .../ObjectStorages/DiskObjectStorage.cpp      |  2 +-
 .../DiskObjectStorageMetadata.cpp             |  8 +++
 .../DiskObjectStorageMetadata.h               |  1 +
 .../QueryPlan/ReadFromMergeTree.cpp           |  1 -
 src/Storages/MergeTree/AlterConversions.h     | 24 +++++++
 .../IMergeTreeDataPartInfoForReader.h         | 68 +++++++++++++++++++
 src/Storages/MergeTree/IMergeTreeReader.cpp   | 29 ++++----
 src/Storages/MergeTree/IMergeTreeReader.h     |  9 +--
 .../LoadedMergeTreeDataPartInfoForReader.h    | 55 +++++++++++++++
 src/Storages/MergeTree/MarkRange.cpp          | 12 ++++
 src/Storages/MergeTree/MarkRange.h            |  2 +
 .../MergeTreeBaseSelectProcessor.cpp          | 53 +++++++++++----
 .../MergeTree/MergeTreeBaseSelectProcessor.h  | 14 ++++
 .../MergeTree/MergeTreeBlockReadUtils.cpp     | 50 ++++++++------
 .../MergeTree/MergeTreeBlockReadUtils.h       | 15 ++--
 src/Storages/MergeTree/MergeTreeData.cpp      |  2 +-
 src/Storages/MergeTree/MergeTreeData.h        | 15 +---
 .../MergeTree/MergeTreeDataPartCompact.cpp    | 42 +++++++-----
 .../MergeTree/MergeTreeDataPartCompact.h      |  5 ++
 .../MergeTree/MergeTreeDataPartInMemory.cpp   |  4 +-
 .../MergeTree/MergeTreeDataPartWide.cpp       | 47 +++++++------
 .../MergeTree/MergeTreeDataPartWide.h         |  5 ++
 .../MergeTree/MergeTreeIndexGranularityInfo.h |  2 +
 .../MergeTree/MergeTreeRangeReader.cpp        |  9 +--
 src/Storages/MergeTree/MergeTreeReadPool.cpp  |  5 +-
 src/Storages/MergeTree/MergeTreeReadPool.h    | 14 ++--
 .../MergeTree/MergeTreeReaderCompact.cpp      | 42 ++++++------
 .../MergeTree/MergeTreeReaderCompact.h        |  4 +-
 .../MergeTree/MergeTreeReaderInMemory.cpp     |  5 +-
 .../MergeTree/MergeTreeReaderInMemory.h       |  1 +
 .../MergeTree/MergeTreeReaderWide.cpp         | 24 +++----
 src/Storages/MergeTree/MergeTreeReaderWide.h  |  2 +-
 .../MergeTree/MergeTreeSelectProcessor.cpp    |  3 +-
 .../MergeTree/MergeTreeSequentialSource.cpp   |  3 +-
 .../System/StorageSystemRemoteDataPaths.cpp   | 11 ++-
 39 files changed, 444 insertions(+), 185 deletions(-)
 create mode 100644 src/Storages/MergeTree/AlterConversions.h
 create mode 100644 src/Storages/MergeTree/IMergeTreeDataPartInfoForReader.h
 create mode 100644 src/Storages/MergeTree/LoadedMergeTreeDataPartInfoForReader.h

diff --git a/src/Common/FieldVisitorToString.cpp b/src/Common/FieldVisitorToString.cpp
index 7d93cfba78f..e0e138d744c 100644
--- a/src/Common/FieldVisitorToString.cpp
+++ b/src/Common/FieldVisitorToString.cpp
@@ -145,5 +145,11 @@ String FieldVisitorToString::operator() (const Object & x) const
 }
 
+String convertFieldToString(const Field & field)
+{
+    if (field.getType() == Field::Types::Which::String)
+        return field.get<String>();
+    return applyVisitor(FieldVisitorToString(), field);
+}
 
 }
diff --git a/src/Common/FieldVisitorToString.h b/src/Common/FieldVisitorToString.h
index 324a4aa73d5..cca29a8f7e0 100644
--- a/src/Common/FieldVisitorToString.h
+++ b/src/Common/FieldVisitorToString.h
@@ -31,5 +31,8 @@ public:
     String operator() (const bool & x) const;
 };
 
-}
+/// Get value from field and convert it to string.
+/// Also remove quotes from strings.
+String convertFieldToString(const Field & field);
+}
diff --git a/src/Dictionaries/getDictionaryConfigurationFromAST.cpp b/src/Dictionaries/getDictionaryConfigurationFromAST.cpp
index 7d8253c47ce..e19495a27a3 100644
--- a/src/Dictionaries/getDictionaryConfigurationFromAST.cpp
+++ b/src/Dictionaries/getDictionaryConfigurationFromAST.cpp
@@ -44,15 +44,6 @@ struct AttributeConfiguration
 
 using AttributeNameToConfiguration = std::unordered_map;
 
-/// Get value from field and convert it to string.
-/// Also remove quotes from strings.
-String getFieldAsString(const Field & field)
-{
-    if (field.getType() == Field::Types::Which::String)
-        return field.get<String>();
-    return applyVisitor(FieldVisitorToString(), field);
-}
-
 String getAttributeExpression(const ASTDictionaryAttributeDeclaration * dict_attr)
 {
     if (!dict_attr->expression)
@@ -61,7 +52,7 @@ String getAttributeExpression(const ASTDictionaryAttributeDeclaration * dict_att
     /// EXPRESSION PROPERTY should be expression or string
     String expression_str;
     if (const auto * literal = dict_attr->expression->as<ASTLiteral>(); literal && literal->value.getType() == Field::Types::String)
-        expression_str = getFieldAsString(literal->value);
+        expression_str = convertFieldToString(literal->value);
     else
         expression_str = queryToString(dict_attr->expression);
 
@@ -275,7 +266,7 @@ void buildSingleAttribute(
     AutoPtr null_value_element(doc->createElement("null_value"));
     String null_value_str;
     if (dict_attr->default_value)
-        null_value_str = getFieldAsString(dict_attr->default_value->as<ASTLiteral>()->value);
+        null_value_str = convertFieldToString(dict_attr->default_value->as<ASTLiteral>()->value);
     AutoPtr null_value(doc->createTextNode(null_value_str));
     null_value_element->appendChild(null_value);
     attribute_element->appendChild(null_value_element);
@@ -452,7 +443,7 @@ void buildConfigurationFromFunctionWithKeyValueArguments(
     }
     else if (const auto * literal = pair->second->as<ASTLiteral>())
     {
-        AutoPtr value(doc->createTextNode(getFieldAsString(literal->value)));
+        AutoPtr value(doc->createTextNode(convertFieldToString(literal->value)));
         current_xml_element->appendChild(value);
     }
     else if (const auto * list = pair->second->as<ASTExpressionList>())
@@ -473,7 +464,7 @@ void buildConfigurationFromFunctionWithKeyValueArguments(
 
             Field value;
             result->get(0, value);
-            AutoPtr text_value(doc->createTextNode(getFieldAsString(value)));
+            AutoPtr text_value(doc->createTextNode(convertFieldToString(value)));
             current_xml_element->appendChild(text_value);
         }
         else
@@ -519,7 +510,7 @@ void buildSourceConfiguration(
         {
             AutoPtr setting_change_element(doc->createElement(name));
             settings_element->appendChild(setting_change_element);
-            AutoPtr setting_value(doc->createTextNode(getFieldAsString(value)));
+            AutoPtr setting_value(doc->createTextNode(convertFieldToString(value)));
             setting_change_element->appendChild(setting_value);
         }
 }
diff --git a/src/Disks/IDisk.h b/src/Disks/IDisk.h
index bfbdba0e050..81f33b27056 100644
--- a/src/Disks/IDisk.h
+++ b/src/Disks/IDisk.h
@@ -239,7 +239,16 @@ public:
     }
 
     /// For one local path there might be multiple remote paths in case of Log family engines.
-    using LocalPathWithObjectStoragePaths = std::pair<String, StoredObjects>;
+    struct LocalPathWithObjectStoragePaths
+    {
+        std::string local_path;
+        std::string common_prefix_for_objects;
+        StoredObjects objects;
+
+        LocalPathWithObjectStoragePaths(
+            const std::string & local_path_, const std::string & common_prefix_for_objects_, StoredObjects && objects_)
+            : local_path(local_path_), common_prefix_for_objects(common_prefix_for_objects_), objects(std::move(objects_)) {}
+    };
 
     virtual void getRemotePathsRecursive(const String &, std::vector<LocalPathWithObjectStoragePaths> &)
     {
diff --git a/src/Disks/ObjectStorages/DiskObjectStorage.cpp b/src/Disks/ObjectStorages/DiskObjectStorage.cpp
index 4e9dea7f481..913fd76bf8a 100644
--- a/src/Disks/ObjectStorages/DiskObjectStorage.cpp
+++ b/src/Disks/ObjectStorages/DiskObjectStorage.cpp
@@ -127,7 +127,7 @@ void DiskObjectStorage::getRemotePathsRecursive(const String & local_path, std::
     {
         try
         {
-            paths_map.emplace_back(local_path, getStorageObjects(local_path));
+            paths_map.emplace_back(local_path, metadata_storage->getObjectStorageRootPath(), getStorageObjects(local_path));
         }
         catch (const Exception & e)
         {
diff --git a/src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp b/src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp
index ae03915d944..f18debe8a8b 100644
--- a/src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp
+++ b/src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp
@@ -68,6 +68,14 @@ void DiskObjectStorageMetadata::deserialize(ReadBuffer & buf)
     }
 }
 
+void DiskObjectStorageMetadata::createFromSingleObject(const std::string & relative_path, size_t bytes_size, size_t ref_count_, bool read_only_)
+{
+    storage_objects.emplace_back(relative_path, bytes_size);
+    total_size = bytes_size;
+    ref_count = ref_count_;
+    read_only = read_only_;
+}
+
 void DiskObjectStorageMetadata::deserializeFromString(const std::string & data)
 {
     ReadBufferFromString buf(data);
diff --git a/src/Disks/ObjectStorages/DiskObjectStorageMetadata.h b/src/Disks/ObjectStorages/DiskObjectStorageMetadata.h
index d3ea5795dd3..09e0f4ee85b 100644
--- a/src/Disks/ObjectStorages/DiskObjectStorageMetadata.h
+++ b/src/Disks/ObjectStorages/DiskObjectStorageMetadata.h
@@ -50,6 +50,7 @@ public:
 
     void deserialize(ReadBuffer & buf);
     void deserializeFromString(const std::string & data);
+    void createFromSingleObject(const std::string & relative_path, size_t bytes_size, size_t ref_count_, bool is_read_only_);
 
     void serialize(WriteBuffer & buf, bool sync) const;
     std::string serializeToString() const;
diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.cpp b/src/Processors/QueryPlan/ReadFromMergeTree.cpp
index 6cae86c9717..60bf8d6a15c 100644
--- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp
+++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp
@@ -179,7 +179,6 @@ Pipe ReadFromMergeTree::readFromPool(
         sum_marks,
         min_marks_for_concurrent_read,
         std::move(parts_with_range),
-        data,
         storage_snapshot,
         prewhere_info,
         required_columns,
diff --git a/src/Storages/MergeTree/AlterConversions.h b/src/Storages/MergeTree/AlterConversions.h
new file mode 100644
index 00000000000..0d58499d424
--- /dev/null
+++ b/src/Storages/MergeTree/AlterConversions.h
@@ -0,0 +1,24 @@
+#pragma once
+
+#include <string>
+#include <unordered_map>
+
+
+namespace DB
+{
+
+/// Alter conversions which should be applied on-the-fly for a part. Built from
+/// the most recent mutation commands for the part. Now we have only rename_map
+/// here (from the ALTER_RENAME command), because for all other types of alters
+/// we can deduce the conversions for a part from the difference between
+/// part->getColumns() and storage->getColumns().
+struct AlterConversions
+{
+    /// Rename map new_name -> old_name
+    std::unordered_map<std::string, std::string> rename_map;
+
+    bool isColumnRenamed(const std::string & new_name) const { return rename_map.count(new_name) > 0; }
+    std::string getColumnOldName(const std::string & new_name) const { return rename_map.at(new_name); }
+};
+
+}
diff --git a/src/Storages/MergeTree/IMergeTreeDataPartInfoForReader.h b/src/Storages/MergeTree/IMergeTreeDataPartInfoForReader.h
new file mode 100644
index 00000000000..28f834d661d
--- /dev/null
+++ b/src/Storages/MergeTree/IMergeTreeDataPartInfoForReader.h
@@ -0,0 +1,68 @@
+#pragma once
+#include <Core/NamesAndTypes.h>
+#include <Interpreters/Context_fwd.h>
+#include <Storages/MergeTree/AlterConversions.h>
+
+namespace DB
+{
+
+class IDataPartStorage;
+using DataPartStoragePtr = std::shared_ptr<IDataPartStorage>;
+class MergeTreeIndexGranularity;
+struct MergeTreeDataPartChecksums;
+struct MergeTreeIndexGranularityInfo;
+class ISerialization;
+using SerializationPtr = std::shared_ptr<const ISerialization>;
+
+/**
+ * A class which contains all information about a data part that is required
+ * in order to use the MergeTreeDataPartReaders.
+ * It is a separate interface and not a simple struct because
+ * otherwise it will need to copy all the information which might not
+ * be even used (for example, an IndexGranularity object is quite heavy).
+ */
+class IMergeTreeDataPartInfoForReader : public WithContext
+{
+public:
+    explicit IMergeTreeDataPartInfoForReader(ContextPtr context_) : WithContext(context_) {}
+
+    virtual ~IMergeTreeDataPartInfoForReader() = default;
+
+    virtual bool isCompactPart() const = 0;
+
+    virtual bool isWidePart() const = 0;
+
+    virtual bool isInMemoryPart() const = 0;
+
+    virtual bool isProjectionPart() const = 0;
+
+    virtual const DataPartStoragePtr & getDataPartStorage() const = 0;
+
+    virtual const NamesAndTypesList & getColumns() const = 0;
+
+    virtual std::optional<size_t> getColumnPosition(const String & column_name) const = 0;
+
+    virtual String getColumnNameWithMinimumCompressedSize(bool with_subcolumns) const = 0;
+
+    virtual const MergeTreeDataPartChecksums & getChecksums() const = 0;
+
+    virtual AlterConversions getAlterConversions() const = 0;
+
+    virtual size_t getMarksCount() const = 0;
+
+    virtual size_t getFileSizeOrZero(const std::string & file_name) const = 0;
+
+    virtual const MergeTreeIndexGranularityInfo & getIndexGranularityInfo() const = 0;
+
+    virtual const MergeTreeIndexGranularity & getIndexGranularity() const = 0;
+
+    virtual SerializationPtr getSerialization(const NameAndTypePair & column) const = 0;
+
+    virtual const SerializationInfoByName & getSerializationInfos() const = 0;
+
+    virtual void reportBroken() = 0;
+};
+
+using MergeTreeDataPartInfoForReaderPtr = std::shared_ptr<IMergeTreeDataPartInfoForReader>;
+
+}
diff --git a/src/Storages/MergeTree/IMergeTreeReader.cpp b/src/Storages/MergeTree/IMergeTreeReader.cpp
index 8c861248580..851b0378e6f 100644
--- a/src/Storages/MergeTree/IMergeTreeReader.cpp
+++ b/src/Storages/MergeTree/IMergeTreeReader.cpp
@@ -23,7 +23,7 @@ namespace ErrorCodes
 
 IMergeTreeReader::IMergeTreeReader(
-    const MergeTreeData::DataPartPtr & data_part_,
+    MergeTreeDataPartInfoForReaderPtr data_part_info_for_read_,
     const NamesAndTypesList & columns_,
     const StorageMetadataPtr & metadata_snapshot_,
     UncompressedCache * uncompressed_cache_,
@@ -31,19 +31,18 @@ IMergeTreeReader::IMergeTreeReader(
     const MarkRanges & all_mark_ranges_,
     const MergeTreeReaderSettings &
settings_, const ValueSizeMap & avg_value_size_hints_) - : data_part(data_part_) + : data_part_info_for_read(data_part_info_for_read_) , avg_value_size_hints(avg_value_size_hints_) , uncompressed_cache(uncompressed_cache_) , mark_cache(mark_cache_) , settings(settings_) - , storage(data_part_->storage) , metadata_snapshot(metadata_snapshot_) , all_mark_ranges(all_mark_ranges_) - , alter_conversions(storage.getAlterConversionsForPart(data_part)) + , alter_conversions(data_part_info_for_read->getAlterConversions()) /// For wide parts convert plain arrays of Nested to subcolumns /// to allow to use shared offset column from cache. - , requested_columns(isWidePart(data_part) ? Nested::convertToSubcolumns(columns_) : columns_) - , part_columns(isWidePart(data_part) ? Nested::collect(data_part->getColumns()) : data_part->getColumns()) + , requested_columns(data_part_info_for_read->isWidePart() ? Nested::convertToSubcolumns(columns_) : columns_) + , part_columns(data_part_info_for_read->isWidePart() ? Nested::collect(data_part_info_for_read->getColumns()) : data_part_info_for_read->getColumns()) { columns_to_read.reserve(requested_columns.size()); serializations.reserve(requested_columns.size()); @@ -71,7 +70,7 @@ void IMergeTreeReader::fillMissingColumns(Columns & res_columns, bool & should_e catch (Exception & e) { /// Better diagnostics. - e.addMessage("(while reading from part " + data_part->data_part_storage->getFullPath() + ")"); + e.addMessage("(while reading from part " + data_part_info_for_read->getDataPartStorage()->getFullPath() + ")"); throw; } } @@ -99,13 +98,13 @@ void IMergeTreeReader::evaluateMissingDefaults(Block additional_columns, Columns } auto dag = DB::evaluateMissingDefaults( - additional_columns, requested_columns, metadata_snapshot->getColumns(), storage.getContext()); + additional_columns, requested_columns, metadata_snapshot->getColumns(), data_part_info_for_read->getContext()); if (dag) { dag->addMaterializingOutputActions(); auto actions = std::make_shared< ExpressionActions>(std::move(dag), - ExpressionActionsSettings::fromSettings(storage.getContext()->getSettingsRef())); + ExpressionActionsSettings::fromSettings(data_part_info_for_read->getContext()->getSettingsRef())); actions->execute(additional_columns); } @@ -117,7 +116,7 @@ void IMergeTreeReader::evaluateMissingDefaults(Block additional_columns, Columns catch (Exception & e) { /// Better diagnostics. - e.addMessage("(while reading from part " + data_part->data_part_storage->getFullPath() + ")"); + e.addMessage("(while reading from part " + data_part_info_for_read->getDataPartStorage()->getFullPath() + ")"); throw; } } @@ -151,7 +150,7 @@ SerializationPtr IMergeTreeReader::getSerializationInPart(const NameAndTypePair if (!column_in_part) return IDataType::getSerialization(required_column); - const auto & infos = data_part->getSerializationInfos(); + const auto & infos = data_part_info_for_read->getSerializationInfos(); if (auto it = infos.find(column_in_part->getNameInStorage()); it != infos.end()) return IDataType::getSerialization(*column_in_part, *it->second); @@ -187,7 +186,7 @@ void IMergeTreeReader::performRequiredConversions(Columns & res_columns) const copy_block.insert({res_columns[pos], getColumnInPart(*name_and_type).type, name_and_type->name}); } - DB::performRequiredConversions(copy_block, requested_columns, storage.getContext()); + DB::performRequiredConversions(copy_block, requested_columns, data_part_info_for_read->getContext()); /// Move columns from block. 
name_and_type = requested_columns.begin(); @@ -197,7 +196,7 @@ void IMergeTreeReader::performRequiredConversions(Columns & res_columns) const catch (Exception & e) { /// Better diagnostics. - e.addMessage("(while reading from part " + data_part->data_part_storage->getFullPath() + ")"); + e.addMessage("(while reading from part " + data_part_info_for_read->getDataPartStorage()->getFullPath() + ")"); throw; } } @@ -205,11 +204,11 @@ void IMergeTreeReader::performRequiredConversions(Columns & res_columns) const IMergeTreeReader::ColumnPosition IMergeTreeReader::findColumnForOffsets(const String & column_name) const { String table_name = Nested::extractTableName(column_name); - for (const auto & part_column : data_part->getColumns()) + for (const auto & part_column : data_part_info_for_read->getColumns()) { if (typeid_cast(part_column.type.get())) { - auto position = data_part->getColumnPosition(part_column.getNameInStorage()); + auto position = data_part_info_for_read->getColumnPosition(part_column.getNameInStorage()); if (position && Nested::extractTableName(part_column.name) == table_name) return position; } diff --git a/src/Storages/MergeTree/IMergeTreeReader.h b/src/Storages/MergeTree/IMergeTreeReader.h index 453563522a5..f88f916908f 100644 --- a/src/Storages/MergeTree/IMergeTreeReader.h +++ b/src/Storages/MergeTree/IMergeTreeReader.h @@ -4,6 +4,8 @@ #include #include #include +#include +#include namespace DB { @@ -20,7 +22,7 @@ public: using DeserializeBinaryBulkStateMap = std::map; IMergeTreeReader( - const MergeTreeData::DataPartPtr & data_part_, + MergeTreeDataPartInfoForReaderPtr data_part_info_for_read_, const NamesAndTypesList & columns_, const StorageMetadataPtr & metadata_snapshot_, UncompressedCache * uncompressed_cache_, @@ -57,7 +59,7 @@ public: size_t getFirstMarkToRead() const { return all_mark_ranges.front().begin; } - MergeTreeData::DataPartPtr data_part; + MergeTreeDataPartInfoForReaderPtr data_part_info_for_read; protected: /// Returns actual column name in part, which can differ from table metadata. @@ -86,7 +88,6 @@ protected: MergeTreeReaderSettings settings; - const MergeTreeData & storage; StorageMetadataPtr metadata_snapshot; MarkRanges all_mark_ranges; @@ -95,7 +96,7 @@ protected: private: /// Alter conversions, which must be applied on fly if required - MergeTreeData::AlterConversions alter_conversions; + AlterConversions alter_conversions; /// Columns that are requested to read. 
NamesAndTypesList requested_columns; diff --git a/src/Storages/MergeTree/LoadedMergeTreeDataPartInfoForReader.h b/src/Storages/MergeTree/LoadedMergeTreeDataPartInfoForReader.h new file mode 100644 index 00000000000..a16aaa728ae --- /dev/null +++ b/src/Storages/MergeTree/LoadedMergeTreeDataPartInfoForReader.h @@ -0,0 +1,55 @@ +#pragma once +#include +#include + + +namespace DB +{ + +class LoadedMergeTreeDataPartInfoForReader final : public IMergeTreeDataPartInfoForReader +{ +public: + explicit LoadedMergeTreeDataPartInfoForReader(MergeTreeData::DataPartPtr data_part_) + : IMergeTreeDataPartInfoForReader(data_part_->storage.getContext()) + , data_part(data_part_) + {} + + bool isCompactPart() const override { return DB::isCompactPart(data_part); } + + bool isWidePart() const override { return DB::isWidePart(data_part); } + + bool isInMemoryPart() const override { return DB::isInMemoryPart(data_part); } + + bool isProjectionPart() const override { return data_part->isProjectionPart(); } + + const DataPartStoragePtr & getDataPartStorage() const override { return data_part->data_part_storage; } + + const NamesAndTypesList & getColumns() const override { return data_part->getColumns(); } + + std::optional getColumnPosition(const String & column_name) const override { return data_part->getColumnPosition(column_name); } + + AlterConversions getAlterConversions() const override { return data_part->storage.getAlterConversionsForPart(data_part); } + + String getColumnNameWithMinimumCompressedSize(bool with_subcolumns) const override { return data_part->getColumnNameWithMinimumCompressedSize(with_subcolumns); } + + const MergeTreeDataPartChecksums & getChecksums() const override { return data_part->checksums; } + + void reportBroken() override { data_part->storage.reportBrokenPart(data_part); } + + size_t getMarksCount() const override { return data_part->getMarksCount(); } + + size_t getFileSizeOrZero(const std::string & file_name) const override { return data_part->getFileSizeOrZero(file_name); } + + const MergeTreeIndexGranularityInfo & getIndexGranularityInfo() const override { return data_part->index_granularity_info; } + + const MergeTreeIndexGranularity & getIndexGranularity() const override { return data_part->index_granularity; } + + const SerializationInfoByName & getSerializationInfos() const override { return data_part->getSerializationInfos(); } + + SerializationPtr getSerialization(const NameAndTypePair & column) const override { return data_part->getSerialization(column.name); } + +private: + MergeTreeData::DataPartPtr data_part; +}; + +} diff --git a/src/Storages/MergeTree/MarkRange.cpp b/src/Storages/MergeTree/MarkRange.cpp index 343c4ecaf22..903940efa94 100644 --- a/src/Storages/MergeTree/MarkRange.cpp +++ b/src/Storages/MergeTree/MarkRange.cpp @@ -36,4 +36,16 @@ size_t getLastMark(const MarkRanges & ranges) return current_task_last_mark; } +std::string toString(const MarkRanges & ranges) +{ + std::string result; + for (const auto & mark_range : ranges) + { + if (!result.empty()) + result += ", "; + result += "(" + std::to_string(mark_range.begin) + ", " + std::to_string(mark_range.end) + ")"; + } + return result; +} + } diff --git a/src/Storages/MergeTree/MarkRange.h b/src/Storages/MergeTree/MarkRange.h index 4f32be6ab14..fe02eb056b7 100644 --- a/src/Storages/MergeTree/MarkRange.h +++ b/src/Storages/MergeTree/MarkRange.h @@ -32,4 +32,6 @@ using MarkRanges = std::deque; */ size_t getLastMark(const MarkRanges & ranges); +std::string toString(const MarkRanges & ranges); + } diff --git 
a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp index e2cd797ab92..475407a402b 100644 --- a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp @@ -43,6 +43,7 @@ MergeTreeBaseSelectProcessor::MergeTreeBaseSelectProcessor( , storage(storage_) , storage_snapshot(storage_snapshot_) , prewhere_info(prewhere_info_) + , prewhere_actions(getPrewhereActions(prewhere_info, actions_settings)) , max_block_size_rows(max_block_size_rows_) , preferred_block_size_bytes(preferred_block_size_bytes_) , preferred_max_column_in_block_size_bytes(preferred_max_column_in_block_size_bytes_) @@ -72,7 +73,12 @@ MergeTreeBaseSelectProcessor::MergeTreeBaseSelectProcessor( header_without_virtual_columns.erase(*it); } } +} + +std::unique_ptr MergeTreeBaseSelectProcessor::getPrewhereActions(PrewhereInfoPtr prewhere_info, const ExpressionActionsSettings & actions_settings) +{ + std::unique_ptr prewhere_actions; if (prewhere_info) { prewhere_actions = std::make_unique(); @@ -100,6 +106,8 @@ MergeTreeBaseSelectProcessor::MergeTreeBaseSelectProcessor( prewhere_actions->steps.emplace_back(std::move(prewhere_step)); } + + return prewhere_actions; } @@ -262,45 +270,62 @@ void MergeTreeBaseSelectProcessor::initializeMergeTreeReadersForPart( void MergeTreeBaseSelectProcessor::initializeRangeReaders(MergeTreeReadTask & current_task) { - MergeTreeRangeReader* prev_reader = nullptr; + return initializeRangeReadersImpl( + current_task.range_reader, current_task.pre_range_readers, prewhere_info, prewhere_actions.get(), + reader.get(), current_task.data_part->hasLightweightDelete(), reader_settings, + pre_reader_for_step, lightweight_delete_filter_step, non_const_virtual_column_names); +} + +void MergeTreeBaseSelectProcessor::initializeRangeReadersImpl( + MergeTreeRangeReader & range_reader, std::deque & pre_range_readers, + PrewhereInfoPtr prewhere_info, const PrewhereExprInfo * prewhere_actions, + IMergeTreeReader * reader, bool has_lightweight_delete, const MergeTreeReaderSettings & reader_settings, + const std::vector> & pre_reader_for_step, + const PrewhereExprStep & lightweight_delete_filter_step, const Names & non_const_virtual_column_names) +{ + MergeTreeRangeReader * prev_reader = nullptr; bool last_reader = false; size_t pre_readers_shift = 0; /// Add filtering step with lightweight delete mask - if (reader_settings.apply_deleted_mask && current_task.data_part->hasLightweightDelete()) + if (reader_settings.apply_deleted_mask && has_lightweight_delete) { - current_task.pre_range_readers.push_back( - MergeTreeRangeReader(pre_reader_for_step[0].get(), prev_reader, &lightweight_delete_filter_step, last_reader, non_const_virtual_column_names)); - prev_reader = ¤t_task.pre_range_readers.back(); + MergeTreeRangeReader pre_range_reader(pre_reader_for_step[0].get(), prev_reader, &lightweight_delete_filter_step, last_reader, non_const_virtual_column_names); + pre_range_readers.push_back(std::move(pre_range_reader)); + prev_reader = &pre_range_readers.back(); pre_readers_shift++; } if (prewhere_info) { if (prewhere_actions->steps.size() + pre_readers_shift != pre_reader_for_step.size()) - throw Exception(ErrorCodes::LOGICAL_ERROR, - "PREWHERE steps count mismatch, actions: {}, readers: {}", - prewhere_actions->steps.size(), pre_reader_for_step.size()); + { + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "PREWHERE steps count mismatch, actions: {}, readers: {}", + prewhere_actions->steps.size(), 
pre_reader_for_step.size()); + } for (size_t i = 0; i < prewhere_actions->steps.size(); ++i) { last_reader = reader->getColumns().empty() && (i + 1 == prewhere_actions->steps.size()); - current_task.pre_range_readers.push_back( - MergeTreeRangeReader(pre_reader_for_step[i + pre_readers_shift].get(), prev_reader, &prewhere_actions->steps[i], last_reader, non_const_virtual_column_names)); - prev_reader = ¤t_task.pre_range_readers.back(); + MergeTreeRangeReader current_reader(pre_reader_for_step[i + pre_readers_shift].get(), prev_reader, &prewhere_actions->steps[i], last_reader, non_const_virtual_column_names); + + pre_range_readers.push_back(std::move(current_reader)); + prev_reader = &pre_range_readers.back(); } } if (!last_reader) { - current_task.range_reader = MergeTreeRangeReader(reader.get(), prev_reader, nullptr, true, non_const_virtual_column_names); + range_reader = MergeTreeRangeReader(reader, prev_reader, nullptr, true, non_const_virtual_column_names); } else { /// If all columns are read by pre_range_readers than move last pre_range_reader into range_reader - current_task.range_reader = std::move(current_task.pre_range_readers.back()); - current_task.pre_range_readers.pop_back(); + range_reader = std::move(pre_range_readers.back()); + pre_range_readers.pop_back(); } } diff --git a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h index aa1b9d3541e..051854d8bc1 100644 --- a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h +++ b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h @@ -89,6 +89,20 @@ protected: static void injectVirtualColumns(Block & block, size_t row_count, MergeTreeReadTask * task, const DataTypePtr & partition_value_type, const Names & virtual_columns); + static std::unique_ptr getPrewhereActions(PrewhereInfoPtr prewhere_info, const ExpressionActionsSettings & actions_settings); + + static void initializeRangeReadersImpl( + MergeTreeRangeReader & range_reader, + std::deque & pre_range_readers, + PrewhereInfoPtr prewhere_info, + const PrewhereExprInfo * prewhere_actions, + IMergeTreeReader * reader, + bool has_lightweight_delete, + const MergeTreeReaderSettings & reader_settings, + const std::vector> & pre_reader_for_step, + const PrewhereExprStep & lightweight_delete_filter_step, + const Names & non_const_virtual_column_names); + /// Sets up data readers for each step of prewhere and where void initializeMergeTreeReadersForPart( MergeTreeData::DataPartPtr & data_part, diff --git a/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp b/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp index 62fac84fc36..c3f069498be 100644 --- a/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp +++ b/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -28,8 +29,8 @@ namespace bool injectRequiredColumnsRecursively( const String & column_name, const StorageSnapshotPtr & storage_snapshot, - const MergeTreeData::AlterConversions & alter_conversions, - const MergeTreeData::DataPartPtr & part, + const AlterConversions & alter_conversions, + const IMergeTreeDataPartInfoForReader & data_part_info_for_reader, const GetColumnsOptions & options, Names & columns, NameSet & required_columns, @@ -47,7 +48,7 @@ bool injectRequiredColumnsRecursively( if (alter_conversions.isColumnRenamed(column_name_in_part)) column_name_in_part = alter_conversions.getColumnOldName(column_name_in_part); - auto column_in_part = 
part->getColumns().tryGetByName(column_name_in_part); + auto column_in_part = data_part_info_for_reader.getColumns().tryGetByName(column_name_in_part); if (column_in_part && (!column_in_storage->isSubcolumn() @@ -78,7 +79,7 @@ bool injectRequiredColumnsRecursively( bool result = false; for (const auto & identifier : identifiers) result |= injectRequiredColumnsRecursively( - identifier, storage_snapshot, alter_conversions, part, + identifier, storage_snapshot, alter_conversions, data_part_info_for_reader, options, columns, required_columns, injected_columns); return result; @@ -87,9 +88,8 @@ bool injectRequiredColumnsRecursively( } NameSet injectRequiredColumns( - const MergeTreeData & storage, + const IMergeTreeDataPartInfoForReader & data_part_info_for_reader, const StorageSnapshotPtr & storage_snapshot, - const MergeTreeData::DataPartPtr & part, bool with_subcolumns, Names & columns) { @@ -97,9 +97,9 @@ NameSet injectRequiredColumns( NameSet injected_columns; bool have_at_least_one_physical_column = false; - MergeTreeData::AlterConversions alter_conversions; - if (!part->isProjectionPart()) - alter_conversions = storage.getAlterConversionsForPart(part); + AlterConversions alter_conversions; + if (!data_part_info_for_reader.isProjectionPart()) + alter_conversions = data_part_info_for_reader.getAlterConversions(); auto options = GetColumnsOptions(GetColumnsOptions::AllPhysical) .withExtendedObjects() @@ -115,7 +115,7 @@ NameSet injectRequiredColumns( have_at_least_one_physical_column |= injectRequiredColumnsRecursively( columns[i], storage_snapshot, alter_conversions, - part, options, columns, required_columns, injected_columns); + data_part_info_for_reader, options, columns, required_columns, injected_columns); } /** Add a column of the minimum size. 
@@ -124,7 +124,7 @@ NameSet injectRequiredColumns( */ if (!have_at_least_one_physical_column) { - const auto minimum_size_column_name = part->getColumnNameWithMinimumCompressedSize(with_subcolumns); + const auto minimum_size_column_name = data_part_info_for_reader.getColumnNameWithMinimumCompressedSize(with_subcolumns); columns.push_back(minimum_size_column_name); /// correctly report added column injected_columns.insert(columns.back()); @@ -135,13 +135,22 @@ NameSet injectRequiredColumns( MergeTreeReadTask::MergeTreeReadTask( - const MergeTreeData::DataPartPtr & data_part_, const MarkRanges & mark_ranges_, size_t part_index_in_query_, - const Names & ordered_names_, const NameSet & column_name_set_, const MergeTreeReadTaskColumns & task_columns_, + const MergeTreeData::DataPartPtr & data_part_, + const MarkRanges & mark_ranges_, + size_t part_index_in_query_, + const Names & ordered_names_, + const NameSet & column_name_set_, + const MergeTreeReadTaskColumns & task_columns_, bool remove_prewhere_column_, MergeTreeBlockSizePredictorPtr && size_predictor_) - : data_part{data_part_}, mark_ranges{mark_ranges_}, part_index_in_query{part_index_in_query_}, - ordered_names{ordered_names_}, column_name_set{column_name_set_}, task_columns{task_columns_}, - remove_prewhere_column{remove_prewhere_column_}, size_predictor{std::move(size_predictor_)} + : data_part{data_part_} + , mark_ranges{mark_ranges_} + , part_index_in_query{part_index_in_query_} + , ordered_names{ordered_names_} + , column_name_set{column_name_set_} + , task_columns{task_columns_} + , remove_prewhere_column{remove_prewhere_column_} + , size_predictor{std::move(size_predictor_)} { } @@ -270,9 +279,8 @@ void MergeTreeBlockSizePredictor::update(const Block & sample_block, const Colum MergeTreeReadTaskColumns getReadTaskColumns( - const MergeTreeData & storage, + const IMergeTreeDataPartInfoForReader & data_part_info_for_reader, const StorageSnapshotPtr & storage_snapshot, - const MergeTreeData::DataPartPtr & data_part, const Names & required_columns, const Names & system_columns, const PrewhereInfoPtr & prewhere_info, @@ -284,13 +292,13 @@ MergeTreeReadTaskColumns getReadTaskColumns( /// Read system columns such as lightweight delete mask "_row_exists" if it is persisted in the part for (const auto & name : system_columns) { - if (data_part->getColumns().contains(name)) + if (data_part_info_for_reader.getColumns().contains(name)) column_names.push_back(name); } /// inject columns required for defaults evaluation injectRequiredColumns( - storage, storage_snapshot, data_part, with_subcolumns, column_names); + data_part_info_for_reader, storage_snapshot, with_subcolumns, column_names); MergeTreeReadTaskColumns result; auto options = GetColumnsOptions(GetColumnsOptions::All) @@ -316,7 +324,7 @@ MergeTreeReadTaskColumns getReadTaskColumns( Names all_pre_column_names = prewhere_info->prewhere_actions->getRequiredColumnsNames(); const auto injected_pre_columns = injectRequiredColumns( - storage, storage_snapshot, data_part, with_subcolumns, all_pre_column_names); + data_part_info_for_reader, storage_snapshot, with_subcolumns, all_pre_column_names); for (const auto & name : all_pre_column_names) { diff --git a/src/Storages/MergeTree/MergeTreeBlockReadUtils.h b/src/Storages/MergeTree/MergeTreeBlockReadUtils.h index 5a36955b4d3..e1c06869bb7 100644 --- a/src/Storages/MergeTree/MergeTreeBlockReadUtils.h +++ b/src/Storages/MergeTree/MergeTreeBlockReadUtils.h @@ -12,6 +12,7 @@ namespace DB class MergeTreeData; struct MergeTreeReadTask; struct 
MergeTreeBlockSizePredictor; +class IMergeTreeDataPartInfoForReader; using MergeTreeReadTaskPtr = std::unique_ptr; using MergeTreeBlockSizePredictorPtr = std::shared_ptr; @@ -23,9 +24,8 @@ using MergeTreeBlockSizePredictorPtr = std::shared_ptr #include #include +#include #include #include #include @@ -167,20 +168,6 @@ public: STRONG_TYPEDEF(String, PartitionID) - /// Alter conversions which should be applied on-fly for part. Build from of - /// the most recent mutation commands for part. Now we have only rename_map - /// here (from ALTER_RENAME) command, because for all other type of alters - /// we can deduce conversions for part from difference between - /// part->getColumns() and storage->getColumns(). - struct AlterConversions - { - /// Rename map new_name -> old_name - std::unordered_map rename_map; - - bool isColumnRenamed(const String & new_name) const { return rename_map.count(new_name) > 0; } - String getColumnOldName(const String & new_name) const { return rename_map.at(new_name); } - }; - struct LessDataPart { using is_transparent = void; diff --git a/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp b/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp index 046a7d274c0..14b3e33a157 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp @@ -2,6 +2,7 @@ #include #include #include +#include namespace DB @@ -45,9 +46,9 @@ IMergeTreeDataPart::MergeTreeReaderPtr MergeTreeDataPartCompact::getReader( const ValueSizeMap & avg_value_size_hints, const ReadBufferFromFileBase::ProfileCallback & profile_callback) const { - auto ptr = std::static_pointer_cast(shared_from_this()); + auto read_info = std::make_shared(shared_from_this()); return std::make_unique( - ptr, columns_to_read, metadata_snapshot, uncompressed_cache, + read_info, columns_to_read, metadata_snapshot, uncompressed_cache, mark_cache, mark_ranges, reader_settings, avg_value_size_hints, profile_callback); } @@ -90,39 +91,44 @@ void MergeTreeDataPartCompact::calculateEachColumnSizes(ColumnSizeByName & /*eac total_size.marks += mrk_checksum->second.file_size; } -void MergeTreeDataPartCompact::loadIndexGranularity() +void MergeTreeDataPartCompact::loadIndexGranularityImpl( + MergeTreeIndexGranularity & index_granularity_, const MergeTreeIndexGranularityInfo & index_granularity_info_, + const NamesAndTypesList & columns_, const DataPartStoragePtr & data_part_storage_) { - //String full_path = getRelativePath(); - - if (columns.empty()) - throw Exception("No columns in part " + name, ErrorCodes::NO_FILE_IN_DATA_PART); - - if (!index_granularity_info.is_adaptive) + if (!index_granularity_info_.is_adaptive) throw Exception("MergeTreeDataPartCompact cannot be created with non-adaptive granulary.", ErrorCodes::NOT_IMPLEMENTED); - auto marks_file_path = index_granularity_info.getMarksFilePath("data"); - if (!data_part_storage->exists(marks_file_path)) + auto marks_file_path = index_granularity_info_.getMarksFilePath("data"); + if (!data_part_storage_->exists(marks_file_path)) throw Exception( ErrorCodes::NO_FILE_IN_DATA_PART, "Marks file '{}' doesn't exist", - std::string(fs::path(data_part_storage->getFullPath()) / marks_file_path)); + std::string(fs::path(data_part_storage_->getFullPath()) / marks_file_path)); - size_t marks_file_size = data_part_storage->getFileSize(marks_file_path); + size_t marks_file_size = data_part_storage_->getFileSize(marks_file_path); - auto buffer = data_part_storage->readFile(marks_file_path, 
ReadSettings().adjustBufferSize(marks_file_size), marks_file_size, std::nullopt); + auto buffer = data_part_storage_->readFile(marks_file_path, ReadSettings().adjustBufferSize(marks_file_size), marks_file_size, std::nullopt); while (!buffer->eof()) { /// Skip offsets for columns - buffer->seek(columns.size() * sizeof(MarkInCompressedFile), SEEK_CUR); + buffer->seek(columns_.size() * sizeof(MarkInCompressedFile), SEEK_CUR); size_t granularity; readIntBinary(granularity, *buffer); - index_granularity.appendMark(granularity); + index_granularity_.appendMark(granularity); } - if (index_granularity.getMarksCount() * index_granularity_info.getMarkSizeInBytes(columns.size()) != marks_file_size) + if (index_granularity_.getMarksCount() * index_granularity_info_.getMarkSizeInBytes(columns_.size()) != marks_file_size) throw Exception("Cannot read all marks from file " + marks_file_path, ErrorCodes::CANNOT_READ_ALL_DATA); - index_granularity.setInitialized(); + index_granularity_.setInitialized(); +} + +void MergeTreeDataPartCompact::loadIndexGranularity() +{ + if (columns.empty()) + throw Exception("No columns in part " + name, ErrorCodes::NO_FILE_IN_DATA_PART); + + loadIndexGranularityImpl(index_granularity, index_granularity_info, columns, data_part_storage); } bool MergeTreeDataPartCompact::hasColumnFiles(const NameAndTypePair & column) const diff --git a/src/Storages/MergeTree/MergeTreeDataPartCompact.h b/src/Storages/MergeTree/MergeTreeDataPartCompact.h index b1c0851afde..26c335f4324 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartCompact.h +++ b/src/Storages/MergeTree/MergeTreeDataPartCompact.h @@ -65,6 +65,11 @@ public: ~MergeTreeDataPartCompact() override; +protected: + static void loadIndexGranularityImpl( + MergeTreeIndexGranularity & index_granularity_, const MergeTreeIndexGranularityInfo & index_granularity_info_, + const NamesAndTypesList & columns_, const DataPartStoragePtr & data_part_storage_); + private: void checkConsistency(bool require_part_metadata) const override; diff --git a/src/Storages/MergeTree/MergeTreeDataPartInMemory.cpp b/src/Storages/MergeTree/MergeTreeDataPartInMemory.cpp index 1c5006f4211..c7c831c23ec 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartInMemory.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartInMemory.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -48,9 +49,10 @@ IMergeTreeDataPart::MergeTreeReaderPtr MergeTreeDataPartInMemory::getReader( const ValueSizeMap & /* avg_value_size_hints */, const ReadBufferFromFileBase::ProfileCallback & /* profile_callback */) const { + auto read_info = std::make_shared(shared_from_this()); auto ptr = std::static_pointer_cast(shared_from_this()); return std::make_unique( - ptr, columns_to_read, metadata_snapshot, mark_ranges, reader_settings); + read_info, ptr, columns_to_read, metadata_snapshot, mark_ranges, reader_settings); } IMergeTreeDataPart::MergeTreeWriterPtr MergeTreeDataPartInMemory::getWriter( diff --git a/src/Storages/MergeTree/MergeTreeDataPartWide.cpp b/src/Storages/MergeTree/MergeTreeDataPartWide.cpp index c7b6ff0c4dd..58a0e48caab 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWide.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWide.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include @@ -47,9 +48,9 @@ IMergeTreeDataPart::MergeTreeReaderPtr MergeTreeDataPartWide::getReader( const ValueSizeMap & avg_value_size_hints, const ReadBufferFromFileBase::ProfileCallback & profile_callback) const { - auto ptr = 
std::static_pointer_cast(shared_from_this()); + auto read_info = std::make_shared(shared_from_this()); return std::make_unique( - ptr, columns_to_read, + read_info, columns_to_read, metadata_snapshot, uncompressed_cache, mark_cache, mark_ranges, reader_settings, avg_value_size_hints, profile_callback); @@ -103,46 +104,52 @@ ColumnSize MergeTreeDataPartWide::getColumnSizeImpl( return size; } -void MergeTreeDataPartWide::loadIndexGranularity() +void MergeTreeDataPartWide::loadIndexGranularityImpl( + MergeTreeIndexGranularity & index_granularity_, MergeTreeIndexGranularityInfo & index_granularity_info_, + const DataPartStoragePtr & data_part_storage_, const std::string & any_column_file_name) { - index_granularity_info.changeGranularityIfRequired(data_part_storage); - - - if (columns.empty()) - throw Exception("No columns in part " + name, ErrorCodes::NO_FILE_IN_DATA_PART); + index_granularity_info_.changeGranularityIfRequired(data_part_storage_); /// We can use any column, it doesn't matter - std::string marks_file_path = index_granularity_info.getMarksFilePath(getFileNameForColumn(columns.front())); - if (!data_part_storage->exists(marks_file_path)) + std::string marks_file_path = index_granularity_info_.getMarksFilePath(any_column_file_name); + if (!data_part_storage_->exists(marks_file_path)) throw Exception( ErrorCodes::NO_FILE_IN_DATA_PART, "Marks file '{}' doesn't exist", - std::string(fs::path(data_part_storage->getFullPath()) / marks_file_path)); + std::string(fs::path(data_part_storage_->getFullPath()) / marks_file_path)); - size_t marks_file_size = data_part_storage->getFileSize(marks_file_path); + size_t marks_file_size = data_part_storage_->getFileSize(marks_file_path); - if (!index_granularity_info.is_adaptive) + if (!index_granularity_info_.is_adaptive) { - size_t marks_count = marks_file_size / index_granularity_info.getMarkSizeInBytes(); - index_granularity.resizeWithFixedGranularity(marks_count, index_granularity_info.fixed_index_granularity); /// all the same + size_t marks_count = marks_file_size / index_granularity_info_.getMarkSizeInBytes(); + index_granularity_.resizeWithFixedGranularity(marks_count, index_granularity_info_.fixed_index_granularity); /// all the same } else { - auto buffer = data_part_storage->readFile(marks_file_path, ReadSettings().adjustBufferSize(marks_file_size), marks_file_size, std::nullopt); + auto buffer = data_part_storage_->readFile(marks_file_path, ReadSettings().adjustBufferSize(marks_file_size), marks_file_size, std::nullopt); while (!buffer->eof()) { buffer->seek(sizeof(size_t) * 2, SEEK_CUR); /// skip offset_in_compressed file and offset_in_decompressed_block size_t granularity; readIntBinary(granularity, *buffer); - index_granularity.appendMark(granularity); + index_granularity_.appendMark(granularity); } - if (index_granularity.getMarksCount() * index_granularity_info.getMarkSizeInBytes() != marks_file_size) + if (index_granularity_.getMarksCount() * index_granularity_info_.getMarkSizeInBytes() != marks_file_size) throw Exception( ErrorCodes::CANNOT_READ_ALL_DATA, "Cannot read all marks from file {}", - std::string(fs::path(data_part_storage->getFullPath()) / marks_file_path)); + std::string(fs::path(data_part_storage_->getFullPath()) / marks_file_path)); } - index_granularity.setInitialized(); + index_granularity_.setInitialized(); +} + +void MergeTreeDataPartWide::loadIndexGranularity() +{ + if (columns.empty()) + throw Exception("No columns in part " + name, ErrorCodes::NO_FILE_IN_DATA_PART); + + 
loadIndexGranularityImpl(index_granularity, index_granularity_info, data_part_storage, getFileNameForColumn(columns.front())); } bool MergeTreeDataPartWide::isStoredOnRemoteDisk() const diff --git a/src/Storages/MergeTree/MergeTreeDataPartWide.h b/src/Storages/MergeTree/MergeTreeDataPartWide.h index 325193557b3..52afa9e82d4 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWide.h +++ b/src/Storages/MergeTree/MergeTreeDataPartWide.h @@ -61,6 +61,11 @@ public: bool hasColumnFiles(const NameAndTypePair & column) const override; +protected: + static void loadIndexGranularityImpl( + MergeTreeIndexGranularity & index_granularity_, MergeTreeIndexGranularityInfo & index_granularity_info_, + const DataPartStoragePtr & data_part_storage_, const std::string & any_column_file_name); + private: void checkConsistency(bool require_part_metadata) const override; diff --git a/src/Storages/MergeTree/MergeTreeIndexGranularityInfo.h b/src/Storages/MergeTree/MergeTreeIndexGranularityInfo.h index dbb027c244e..a5adc919f4f 100644 --- a/src/Storages/MergeTree/MergeTreeIndexGranularityInfo.h +++ b/src/Storages/MergeTree/MergeTreeIndexGranularityInfo.h @@ -29,6 +29,8 @@ public: MergeTreeIndexGranularityInfo(const MergeTreeData & storage, MergeTreeDataPartType type_); + MergeTreeIndexGranularityInfo(MergeTreeDataPartType type_, bool is_adaptive_, size_t index_granularity_, size_t index_granularity_bytes_); + void changeGranularityIfRequired(const DataPartStoragePtr & data_part_storage); String getMarksFilePath(const String & path_prefix) const diff --git a/src/Storages/MergeTree/MergeTreeRangeReader.cpp b/src/Storages/MergeTree/MergeTreeRangeReader.cpp index 79854785016..1a5a4d91806 100644 --- a/src/Storages/MergeTree/MergeTreeRangeReader.cpp +++ b/src/Storages/MergeTree/MergeTreeRangeReader.cpp @@ -83,7 +83,7 @@ MergeTreeRangeReader::DelayedStream::DelayedStream( : current_mark(from_mark), current_offset(0), num_delayed_rows(0) , current_task_last_mark(current_task_last_mark_) , merge_tree_reader(merge_tree_reader_) - , index_granularity(&(merge_tree_reader->data_part->index_granularity)) + , index_granularity(&(merge_tree_reader->data_part_info_for_read->getIndexGranularity())) , continue_reading(false), is_finished(false) { } @@ -181,7 +181,7 @@ MergeTreeRangeReader::Stream::Stream( : current_mark(from_mark), offset_after_current_mark(0) , last_mark(to_mark) , merge_tree_reader(merge_tree_reader_) - , index_granularity(&(merge_tree_reader->data_part->index_granularity)) + , index_granularity(&(merge_tree_reader->data_part_info_for_read->getIndexGranularity())) , current_mark_index_granularity(index_granularity->getMarkRows(from_mark)) , stream(from_mark, current_task_last_mark, merge_tree_reader) { @@ -652,7 +652,7 @@ MergeTreeRangeReader::MergeTreeRangeReader( bool last_reader_in_chain_, const Names & non_const_virtual_column_names_) : merge_tree_reader(merge_tree_reader_) - , index_granularity(&(merge_tree_reader->data_part->index_granularity)) + , index_granularity(&(merge_tree_reader->data_part_info_for_read->getIndexGranularity())) , prev_reader(prev_reader_) , prewhere_info(prewhere_info_) , last_reader_in_chain(last_reader_in_chain_) @@ -946,7 +946,8 @@ MergeTreeRangeReader::ReadResult MergeTreeRangeReader::startReadingChain(size_t result.addRows(stream.finalize(result.columns)); /// Last granule may be incomplete. 
- result.adjustLastGranule(); + if (!result.rowsPerGranule().empty()) + result.adjustLastGranule(); for (const auto & column_name : non_const_virtual_column_names) { diff --git a/src/Storages/MergeTree/MergeTreeReadPool.cpp b/src/Storages/MergeTree/MergeTreeReadPool.cpp index cc2c20eda5a..3f51673a6b1 100644 --- a/src/Storages/MergeTree/MergeTreeReadPool.cpp +++ b/src/Storages/MergeTree/MergeTreeReadPool.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include @@ -22,7 +23,6 @@ MergeTreeReadPool::MergeTreeReadPool( size_t sum_marks_, size_t min_marks_for_concurrent_read_, RangesInDataParts && parts_, - const MergeTreeData & data_, const StorageSnapshotPtr & storage_snapshot_, const PrewhereInfoPtr & prewhere_info_, const Names & column_names_, @@ -32,7 +32,6 @@ MergeTreeReadPool::MergeTreeReadPool( bool do_not_steal_tasks_) : backoff_settings{backoff_settings_} , backoff_state{threads_} - , data{data_} , storage_snapshot{storage_snapshot_} , column_names{column_names_} , virtual_column_names{virtual_column_names_} @@ -214,7 +213,7 @@ std::vector MergeTreeReadPool::fillPerPartInfo(const RangesInDataParts & per_part_sum_marks.push_back(sum_marks); auto task_columns = getReadTaskColumns( - data, storage_snapshot, part.data_part, + LoadedMergeTreeDataPartInfoForReader(part.data_part), storage_snapshot, column_names, virtual_column_names, prewhere_info, /*with_subcolumns=*/ true); auto size_predictor = !predict_block_size_bytes ? nullptr diff --git a/src/Storages/MergeTree/MergeTreeReadPool.h b/src/Storages/MergeTree/MergeTreeReadPool.h index 01a1280b6cb..c9fe70d9a78 100644 --- a/src/Storages/MergeTree/MergeTreeReadPool.h +++ b/src/Storages/MergeTree/MergeTreeReadPool.h @@ -70,11 +70,16 @@ private: public: MergeTreeReadPool( - size_t threads_, size_t sum_marks_, size_t min_marks_for_concurrent_read_, - RangesInDataParts && parts_, const MergeTreeData & data_, const StorageSnapshotPtr & storage_snapshot_, + size_t threads_, + size_t sum_marks_, + size_t min_marks_for_concurrent_read_, + RangesInDataParts && parts_, + const StorageSnapshotPtr & storage_snapshot_, const PrewhereInfoPtr & prewhere_info_, - const Names & column_names_, const Names & virtual_column_names_, - const BackoffSettings & backoff_settings_, size_t preferred_block_size_bytes_, + const Names & column_names_, + const Names & virtual_column_names_, + const BackoffSettings & backoff_settings_, + size_t preferred_block_size_bytes_, bool do_not_steal_tasks_ = false); MergeTreeReadTaskPtr getTask(size_t min_marks_to_read, size_t thread, const Names & ordered_names); @@ -94,7 +99,6 @@ private: size_t threads, size_t sum_marks, std::vector per_part_sum_marks, const RangesInDataParts & parts, size_t min_marks_for_concurrent_read); - const MergeTreeData & data; StorageSnapshotPtr storage_snapshot; const Names column_names; const Names virtual_column_names; diff --git a/src/Storages/MergeTree/MergeTreeReaderCompact.cpp b/src/Storages/MergeTree/MergeTreeReaderCompact.cpp index 44b5fa1a11b..413e6838665 100644 --- a/src/Storages/MergeTree/MergeTreeReaderCompact.cpp +++ b/src/Storages/MergeTree/MergeTreeReaderCompact.cpp @@ -15,7 +15,7 @@ namespace ErrorCodes MergeTreeReaderCompact::MergeTreeReaderCompact( - DataPartCompactPtr data_part_, + MergeTreeDataPartInfoForReaderPtr data_part_info_for_read_, NamesAndTypesList columns_, const StorageMetadataPtr & metadata_snapshot_, UncompressedCache * uncompressed_cache_, @@ -26,7 +26,7 @@ MergeTreeReaderCompact::MergeTreeReaderCompact( const ReadBufferFromFileBase::ProfileCallback & 
profile_callback_, clockid_t clock_type_) : IMergeTreeReader( - data_part_, + data_part_info_for_read_, columns_, metadata_snapshot_, uncompressed_cache_, @@ -35,14 +35,14 @@ MergeTreeReaderCompact::MergeTreeReaderCompact( settings_, avg_value_size_hints_) , marks_loader( - data_part->data_part_storage, + data_part_info_for_read_->getDataPartStorage(), mark_cache, - data_part->index_granularity_info.getMarksFilePath(MergeTreeDataPartCompact::DATA_FILE_NAME), - data_part->getMarksCount(), - data_part->index_granularity_info, + data_part_info_for_read_->getIndexGranularityInfo().getMarksFilePath(MergeTreeDataPartCompact::DATA_FILE_NAME), + data_part_info_for_read_->getMarksCount(), + data_part_info_for_read_->getIndexGranularityInfo(), settings.save_marks_in_cache, settings.read_settings, - data_part->getColumns().size()) + data_part_info_for_read_->getColumns().size()) { try { @@ -64,7 +64,7 @@ MergeTreeReaderCompact::MergeTreeReaderCompact( continue; } - auto position = data_part->getColumnPosition(column_to_read.getNameInStorage()); + auto position = data_part_info_for_read->getColumnPosition(column_to_read.getNameInStorage()); if (!position && typeid_cast(column_to_read.type.get())) { /// If array of Nested column is missing in part, @@ -77,7 +77,7 @@ MergeTreeReaderCompact::MergeTreeReaderCompact( } /// Do not use max_read_buffer_size, but try to lower buffer size with maximal size of granule to avoid reading much data. - auto buffer_size = getReadBufferSize(data_part, marks_loader, column_positions, all_mark_ranges); + auto buffer_size = getReadBufferSize(*data_part_info_for_read, marks_loader, column_positions, all_mark_ranges); if (buffer_size) settings.read_settings = settings.read_settings.adjustBufferSize(buffer_size); @@ -88,10 +88,10 @@ MergeTreeReaderCompact::MergeTreeReaderCompact( if (uncompressed_cache) { auto buffer = std::make_unique( - std::string(fs::path(data_part->data_part_storage->getFullPath()) / path), + std::string(fs::path(data_part_info_for_read->getDataPartStorage()->getFullPath()) / path), [this, path]() { - return data_part->data_part_storage->readFile( + return data_part_info_for_read->getDataPartStorage()->readFile( path, settings.read_settings, std::nullopt, std::nullopt); @@ -113,7 +113,7 @@ MergeTreeReaderCompact::MergeTreeReaderCompact( { auto buffer = std::make_unique( - data_part->data_part_storage->readFile( + data_part_info_for_read->getDataPartStorage()->readFile( path, settings.read_settings, std::nullopt, std::nullopt), @@ -132,7 +132,7 @@ MergeTreeReaderCompact::MergeTreeReaderCompact( } catch (...) { - storage.reportBrokenPart(data_part); + data_part_info_for_read->reportBroken(); throw; } } @@ -156,7 +156,7 @@ size_t MergeTreeReaderCompact::readRows( while (read_rows < max_rows_to_read) { - size_t rows_to_read = data_part->index_granularity.getMarkRows(from_mark); + size_t rows_to_read = data_part_info_for_read->getIndexGranularity().getMarkRows(from_mark); for (size_t pos = 0; pos < num_columns; ++pos) { @@ -179,7 +179,7 @@ size_t MergeTreeReaderCompact::readRows( catch (Exception & e) { if (e.code() != ErrorCodes::MEMORY_LIMIT_EXCEEDED) - storage.reportBrokenPart(data_part); + data_part_info_for_read->reportBroken(); /// Better diagnostics. e.addMessage("(while reading column " + columns_to_read[pos].name + ")"); @@ -187,7 +187,7 @@ size_t MergeTreeReaderCompact::readRows( } catch (...) 
{ - storage.reportBrokenPart(data_part); + data_part_info_for_read->reportBroken(); throw; } } @@ -279,7 +279,7 @@ void MergeTreeReaderCompact::seekToMark(size_t row_index, size_t column_index) void MergeTreeReaderCompact::adjustUpperBound(size_t last_mark) { size_t right_offset = 0; - if (last_mark < data_part->getMarksCount()) /// Otherwise read until the end of file + if (last_mark < data_part_info_for_read->getMarksCount()) /// Otherwise read until the end of file right_offset = marks_loader.getMark(last_mark).offset_in_compressed_file; if (right_offset == 0) @@ -307,7 +307,7 @@ bool MergeTreeReaderCompact::isContinuousReading(size_t mark, size_t column_posi return false; const auto & [last_mark, last_column] = *last_read_granule; return (mark == last_mark && column_position == last_column + 1) - || (mark == last_mark + 1 && column_position == 0 && last_column == data_part->getColumns().size() - 1); + || (mark == last_mark + 1 && column_position == 0 && last_column == data_part_info_for_read->getColumns().size() - 1); } namespace @@ -359,16 +359,16 @@ private: } size_t MergeTreeReaderCompact::getReadBufferSize( - const DataPartPtr & part, + const IMergeTreeDataPartInfoForReader & data_part_info_for_reader, MergeTreeMarksLoader & marks_loader, const ColumnPositions & column_positions, const MarkRanges & mark_ranges) { size_t buffer_size = 0; size_t columns_num = column_positions.size(); - size_t file_size = part->getFileSizeOrZero(MergeTreeDataPartCompact::DATA_FILE_NAME_WITH_EXTENSION); + size_t file_size = data_part_info_for_reader.getFileSizeOrZero(MergeTreeDataPartCompact::DATA_FILE_NAME_WITH_EXTENSION); - MarksCounter counter(part->getMarksCount(), part->getColumns().size()); + MarksCounter counter(data_part_info_for_reader.getMarksCount(), data_part_info_for_reader.getColumns().size()); for (const auto & mark_range : mark_ranges) { diff --git a/src/Storages/MergeTree/MergeTreeReaderCompact.h b/src/Storages/MergeTree/MergeTreeReaderCompact.h index aa0eb949aa1..9a426fda71d 100644 --- a/src/Storages/MergeTree/MergeTreeReaderCompact.h +++ b/src/Storages/MergeTree/MergeTreeReaderCompact.h @@ -19,7 +19,7 @@ class MergeTreeReaderCompact : public IMergeTreeReader { public: MergeTreeReaderCompact( - DataPartCompactPtr data_part_, + MergeTreeDataPartInfoForReaderPtr data_part_info_for_read_, NamesAndTypesList columns_, const StorageMetadataPtr & metadata_snapshot_, UncompressedCache * uncompressed_cache_, @@ -67,7 +67,7 @@ private: /// Returns maximal value of granule size in compressed file from @mark_ranges. /// This value is used as size of read buffer. 
static size_t getReadBufferSize( - const DataPartPtr & part, + const IMergeTreeDataPartInfoForReader & data_part_info_for_read_, MergeTreeMarksLoader & marks_loader, const ColumnPositions & column_positions, const MarkRanges & mark_ranges); diff --git a/src/Storages/MergeTree/MergeTreeReaderInMemory.cpp b/src/Storages/MergeTree/MergeTreeReaderInMemory.cpp index 766c28c99b9..c392199fa9e 100644 --- a/src/Storages/MergeTree/MergeTreeReaderInMemory.cpp +++ b/src/Storages/MergeTree/MergeTreeReaderInMemory.cpp @@ -16,13 +16,14 @@ namespace ErrorCodes MergeTreeReaderInMemory::MergeTreeReaderInMemory( + MergeTreeDataPartInfoForReaderPtr data_part_info_for_read_, DataPartInMemoryPtr data_part_, NamesAndTypesList columns_, const StorageMetadataPtr & metadata_snapshot_, MarkRanges mark_ranges_, MergeTreeReaderSettings settings_) : IMergeTreeReader( - data_part_, + data_part_info_for_read_, columns_, metadata_snapshot_, nullptr, @@ -48,7 +49,7 @@ size_t MergeTreeReaderInMemory::readRows( if (!continue_reading) total_rows_read = 0; - size_t total_marks = data_part->index_granularity.getMarksCount(); + size_t total_marks = data_part_info_for_read->getIndexGranularity().getMarksCount(); if (from_mark >= total_marks) throw Exception("Mark " + toString(from_mark) + " is out of bound. Max mark: " + toString(total_marks), ErrorCodes::ARGUMENT_OUT_OF_BOUND); diff --git a/src/Storages/MergeTree/MergeTreeReaderInMemory.h b/src/Storages/MergeTree/MergeTreeReaderInMemory.h index ff6eb92d9c3..cb67bc46eae 100644 --- a/src/Storages/MergeTree/MergeTreeReaderInMemory.h +++ b/src/Storages/MergeTree/MergeTreeReaderInMemory.h @@ -15,6 +15,7 @@ class MergeTreeReaderInMemory : public IMergeTreeReader { public: MergeTreeReaderInMemory( + MergeTreeDataPartInfoForReaderPtr data_part_info_for_read_, DataPartInMemoryPtr data_part_, NamesAndTypesList columns_, const StorageMetadataPtr & metadata_snapshot_, diff --git a/src/Storages/MergeTree/MergeTreeReaderWide.cpp b/src/Storages/MergeTree/MergeTreeReaderWide.cpp index 1274017b865..5a048e8bc1a 100644 --- a/src/Storages/MergeTree/MergeTreeReaderWide.cpp +++ b/src/Storages/MergeTree/MergeTreeReaderWide.cpp @@ -26,7 +26,7 @@ namespace ErrorCodes } MergeTreeReaderWide::MergeTreeReaderWide( - DataPartWidePtr data_part_, + MergeTreeDataPartInfoForReaderPtr data_part_info_, NamesAndTypesList columns_, const StorageMetadataPtr & metadata_snapshot_, UncompressedCache * uncompressed_cache_, @@ -37,7 +37,7 @@ MergeTreeReaderWide::MergeTreeReaderWide( const ReadBufferFromFileBase::ProfileCallback & profile_callback_, clockid_t clock_type_) : IMergeTreeReader( - data_part_, + data_part_info_, columns_, metadata_snapshot_, uncompressed_cache_, @@ -53,7 +53,7 @@ MergeTreeReaderWide::MergeTreeReaderWide( } catch (...) { - storage.reportBrokenPart(data_part); + data_part_info_for_read->reportBroken(); throw; } } @@ -73,7 +73,7 @@ size_t MergeTreeReaderWide::readRows( std::unordered_map caches; std::unordered_set prefetched_streams; - if (data_part->data_part_storage->isStoredOnRemoteDisk() ? settings.read_settings.remote_fs_prefetch : settings.read_settings.local_fs_prefetch) + if (data_part_info_for_read->getDataPartStorage()->isStoredOnRemoteDisk() ? settings.read_settings.remote_fs_prefetch : settings.read_settings.local_fs_prefetch) { /// Request reading of data in advance, /// so if reading can be asynchronous, it will also be performed in parallel for all columns. 
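[Editor's sketch] The hunks above and below convert the reader classes (MergeTreeReaderCompact, MergeTreeReaderInMemory, MergeTreeReaderWide) from holding a concrete `data_part` to calling through `data_part_info_for_read`. For orientation, here is a minimal sketch of the shape of that abstraction, inferred only from the call sites visible in these diffs (`getColumns()`, `getIndexGranularity()`, `getIndexGranularityInfo()`, `getDataPartStorage()`, `getMarksCount()`, `getFileSizeOrZero()`, `reportBroken()`); the stub types and exact signatures are illustrative, since `IMergeTreeDataPartInfoForReader.h` itself is not part of this excerpt:

``` cpp
#include <cstddef>
#include <memory>
#include <string>

// Stand-in types so this sketch compiles on its own; the real definitions
// live elsewhere in the ClickHouse source tree.
struct NamesAndTypesList {};
struct MergeTreeIndexGranularity {};
struct MergeTreeIndexGranularityInfo {};
struct IDataPartStorage {};
using DataPartStoragePtr = std::shared_ptr<const IDataPartStorage>;

// A narrow, read-side view of a data part: readers depend on this interface
// instead of on MergeTreeData and a concrete IMergeTreeDataPart. Method names
// mirror the call sites in the hunks above; everything else is illustrative.
class IMergeTreeDataPartInfoForReader
{
public:
    virtual ~IMergeTreeDataPartInfoForReader() = default;

    virtual const NamesAndTypesList & getColumns() const = 0;
    virtual const MergeTreeIndexGranularity & getIndexGranularity() const = 0;
    virtual const MergeTreeIndexGranularityInfo & getIndexGranularityInfo() const = 0;
    virtual DataPartStoragePtr getDataPartStorage() const = 0;
    virtual std::size_t getMarksCount() const = 0;
    virtual std::size_t getFileSizeOrZero(const std::string & file_name) const = 0;
    virtual bool isProjectionPart() const = 0;
    virtual void reportBroken() = 0;
};

using MergeTreeDataPartInfoForReaderPtr = std::shared_ptr<IMergeTreeDataPartInfoForReader>;
```

The `LoadedMergeTreeDataPartInfoForReader` constructed at the `getReader()` and `getReadTaskColumns()` call sites is presumably the adapter implementing this interface on top of a fully loaded part, which is what lets `MergeTreeReadPool` drop its `const MergeTreeData & data` member entirely.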
@@ -136,17 +136,17 @@ size_t MergeTreeReaderWide::readRows( catch (Exception & e) { if (e.code() != ErrorCodes::MEMORY_LIMIT_EXCEEDED) - storage.reportBrokenPart(data_part); + data_part_info_for_read->reportBroken(); /// Better diagnostics. - e.addMessage("(while reading from part " + data_part->data_part_storage->getFullPath() + " " + e.addMessage("(while reading from part " + data_part_info_for_read->getDataPartStorage()->getFullPath() + " " "from mark " + toString(from_mark) + " " "with max_rows_to_read = " + toString(max_rows_to_read) + ")"); throw; } catch (...) { - storage.reportBrokenPart(data_part); + data_part_info_for_read->reportBroken(); throw; } @@ -167,7 +167,7 @@ void MergeTreeReaderWide::addStreams( if (streams.contains(stream_name)) return; - bool data_file_exists = data_part->checksums.files.contains(stream_name + DATA_FILE_EXTENSION); + bool data_file_exists = data_part_info_for_read->getChecksums().files.contains(stream_name + DATA_FILE_EXTENSION); /** If data file is missing then we will not try to open it. * It is necessary since it allows to add new column to structure of the table without creating new files for old parts. @@ -178,10 +178,10 @@ void MergeTreeReaderWide::addStreams( bool is_lc_dict = substream_path.size() > 1 && substream_path[substream_path.size() - 2].type == ISerialization::Substream::Type::DictionaryKeys; streams.emplace(stream_name, std::make_unique( - data_part->data_part_storage, stream_name, DATA_FILE_EXTENSION, - data_part->getMarksCount(), all_mark_ranges, settings, mark_cache, - uncompressed_cache, data_part->getFileSizeOrZero(stream_name + DATA_FILE_EXTENSION), - &data_part->index_granularity_info, + data_part_info_for_read->getDataPartStorage(), stream_name, DATA_FILE_EXTENSION, + data_part_info_for_read->getMarksCount(), all_mark_ranges, settings, mark_cache, + uncompressed_cache, data_part_info_for_read->getFileSizeOrZero(stream_name + DATA_FILE_EXTENSION), + &data_part_info_for_read->getIndexGranularityInfo(), profile_callback, clock_type, is_lc_dict)); }; diff --git a/src/Storages/MergeTree/MergeTreeReaderWide.h b/src/Storages/MergeTree/MergeTreeReaderWide.h index 2137695b6d7..dbfc0310242 100644 --- a/src/Storages/MergeTree/MergeTreeReaderWide.h +++ b/src/Storages/MergeTree/MergeTreeReaderWide.h @@ -15,7 +15,7 @@ class MergeTreeReaderWide : public IMergeTreeReader { public: MergeTreeReaderWide( - DataPartWidePtr data_part_, + MergeTreeDataPartInfoForReaderPtr data_part_info_for_read_, NamesAndTypesList columns_, const StorageMetadataPtr & metadata_snapshot_, UncompressedCache * uncompressed_cache_, diff --git a/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp index 3e346df6662..59cbae3f914 100644 --- a/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include @@ -51,7 +52,7 @@ MergeTreeSelectProcessor::MergeTreeSelectProcessor( void MergeTreeSelectProcessor::initializeReaders() { task_columns = getReadTaskColumns( - storage, storage_snapshot, data_part, + LoadedMergeTreeDataPartInfoForReader(data_part), storage_snapshot, required_columns, virt_column_names, prewhere_info, /*with_subcolumns=*/ true); /// Will be used to distinguish between PREWHERE and WHERE columns when applying filter diff --git a/src/Storages/MergeTree/MergeTreeSequentialSource.cpp b/src/Storages/MergeTree/MergeTreeSequentialSource.cpp index 5b9eceece51..9e0c96fd88a 100644 --- 
a/src/Storages/MergeTree/MergeTreeSequentialSource.cpp +++ b/src/Storages/MergeTree/MergeTreeSequentialSource.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -102,7 +103,7 @@ MergeTreeSequentialSource::MergeTreeSequentialSource( addTotalRowsApprox(data_part->rows_count); /// Add columns because we don't want to read empty blocks - injectRequiredColumns(storage, storage_snapshot, data_part, /*with_subcolumns=*/ false, columns_to_read); + injectRequiredColumns(LoadedMergeTreeDataPartInfoForReader(data_part), storage_snapshot, /*with_subcolumns=*/ false, columns_to_read); NamesAndTypesList columns_for_reader; if (take_column_types_from_storage) diff --git a/src/Storages/System/StorageSystemRemoteDataPaths.cpp b/src/Storages/System/StorageSystemRemoteDataPaths.cpp index fe7aaf97970..de7e1911e44 100644 --- a/src/Storages/System/StorageSystemRemoteDataPaths.cpp +++ b/src/Storages/System/StorageSystemRemoteDataPaths.cpp @@ -1,6 +1,7 @@ #include "StorageSystemRemoteDataPaths.h" #include #include +#include #include #include #include @@ -23,6 +24,8 @@ StorageSystemRemoteDataPaths::StorageSystemRemoteDataPaths(const StorageID & tab {"cache_base_path", std::make_shared()}, {"local_path", std::make_shared()}, {"remote_path", std::make_shared()}, + {"size", std::make_shared()}, + {"common_prefix_for_blobs", std::make_shared()}, {"cache_paths", std::make_shared(std::make_shared())}, })); setInMemoryMetadata(storage_metadata); @@ -44,6 +47,8 @@ Pipe StorageSystemRemoteDataPaths::read( MutableColumnPtr col_cache_base_path = ColumnString::create(); MutableColumnPtr col_local_path = ColumnString::create(); MutableColumnPtr col_remote_path = ColumnString::create(); + MutableColumnPtr col_size = ColumnUInt64::create(); + MutableColumnPtr col_namespace = ColumnString::create(); MutableColumnPtr col_cache_paths = ColumnArray::create(ColumnString::create()); auto disks = context->getDisksMap(); @@ -61,7 +66,7 @@ Pipe StorageSystemRemoteDataPaths::read( if (!cache_base_path.empty()) cache = FileCacheFactory::instance().get(cache_base_path); - for (const auto & [local_path, storage_objects] : remote_paths_by_local_path) + for (const auto & [local_path, common_prefox_for_objects, storage_objects] : remote_paths_by_local_path) { for (const auto & object : storage_objects) { @@ -70,6 +75,8 @@ Pipe StorageSystemRemoteDataPaths::read( col_cache_base_path->insert(cache_base_path); col_local_path->insert(local_path); col_remote_path->insert(object.absolute_path); + col_size->insert(object.bytes_size); + col_namespace->insert(common_prefox_for_objects); if (cache) { @@ -91,6 +98,8 @@ Pipe StorageSystemRemoteDataPaths::read( res_columns.emplace_back(std::move(col_cache_base_path)); res_columns.emplace_back(std::move(col_local_path)); res_columns.emplace_back(std::move(col_remote_path)); + res_columns.emplace_back(std::move(col_size)); + res_columns.emplace_back(std::move(col_namespace)); res_columns.emplace_back(std::move(col_cache_paths)); UInt64 num_rows = res_columns.at(0)->size(); From 33f541042a4e07236983a7c68d012b704f8d78ab Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Tue, 6 Sep 2022 00:04:50 +0200 Subject: [PATCH 11/24] Fix clang tidy --- src/Storages/MergeTree/MergeTreeReaderCompact.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/MergeTreeReaderCompact.h b/src/Storages/MergeTree/MergeTreeReaderCompact.h index 9a426fda71d..afc360adc51 100644 --- 
a/src/Storages/MergeTree/MergeTreeReaderCompact.h +++ b/src/Storages/MergeTree/MergeTreeReaderCompact.h @@ -67,7 +67,7 @@ private: /// Returns maximal value of granule size in compressed file from @mark_ranges. /// This value is used as size of read buffer. static size_t getReadBufferSize( - const IMergeTreeDataPartInfoForReader & data_part_info_for_read_, + const IMergeTreeDataPartInfoForReader & data_part_info_for_reader, MergeTreeMarksLoader & marks_loader, const ColumnPositions & column_positions, const MarkRanges & mark_ranges); From 1ebcc3a14ee7bd60ea5819648fb5c44360e1e07c Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Tue, 6 Sep 2022 07:41:37 +0000 Subject: [PATCH 12/24] fix: endswidth --> endswith --- docker/packager/packager | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/packager/packager b/docker/packager/packager index 363be9ab2dd..de41b23acb0 100755 --- a/docker/packager/packager +++ b/docker/packager/packager @@ -142,7 +142,7 @@ def parse_env_variables( is_cross_arm = compiler.endswith(ARM_SUFFIX) is_cross_ppc = compiler.endswith(PPC_SUFFIX) is_cross_freebsd = compiler.endswith(FREEBSD_SUFFIX) - is_amd64_sse2 = compiler.endswidth(AMD64_SSE2_SUFFIX) + is_amd64_sse2 = compiler.endswith(AMD64_SSE2_SUFFIX) if is_cross_darwin: cc = compiler[: -len(DARWIN_SUFFIX)] From 652f1bfd19a086349723b39a13054a1b9da586d5 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Tue, 6 Sep 2022 12:18:11 +0000 Subject: [PATCH 13/24] fix: pass -DNO_SSE3_OR_HIGHER=1 from packager --- docker/packager/packager | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/packager/packager b/docker/packager/packager index de41b23acb0..591262959b4 100755 --- a/docker/packager/packager +++ b/docker/packager/packager @@ -191,6 +191,7 @@ def parse_env_variables( elif is_amd64_sse2: cc = compiler[: -len(AMD64_SSE2_SUFFIX)] result.append("DEB_ARCH=amd64") + cmake_flags.append("-DNO_SSE3_OR_HIGHER=1") else: cc = compiler result.append("DEB_ARCH=amd64") From 6ced4131caf290bf6b5e47f9decac43b5a261320 Mon Sep 17 00:00:00 2001 From: Frank Chen Date: Tue, 6 Sep 2022 22:00:00 +0800 Subject: [PATCH 14/24] exception safe Signed-off-by: Frank Chen --- src/Common/OpenTelemetryTraceContext.cpp | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/Common/OpenTelemetryTraceContext.cpp b/src/Common/OpenTelemetryTraceContext.cpp index c05d3385bc8..9efb278a670 100644 --- a/src/Common/OpenTelemetryTraceContext.cpp +++ b/src/Common/OpenTelemetryTraceContext.cpp @@ -283,10 +283,6 @@ TracingContextHolder::TracingContextHolder( this->root_span.start_time_us = std::chrono::duration_cast(std::chrono::system_clock::now().time_since_epoch()).count(); - /// This object is created to initialize tracing context on a new thread, - /// it's helpful to record the thread_id so that we know the thread switching from the span log - this->root_span.addAttribute("clickhouse.thread_id", getThreadId()); - /// set up trace context on current thread current_thread_trace_context = _parent_trace_context; current_thread_trace_context.span_id = this->root_span.span_id; @@ -306,6 +302,18 @@ TracingContextHolder::~TracingContextHolder() auto shared_span_log = current_thread_trace_context.span_log.lock(); if (shared_span_log) { + try + { + /// This object is created to initialize tracing context on a new thread, + /// it's helpful to record the thread_id so that we know the thread switching from the span log + this->root_span.addAttribute("clickhouse.thread_id", getThreadId()); + } + catch (...) 
+ { + /// It's acceptable that the attribute is not recorded in case of any exception, + /// so the exception is ignored to try to log the span. + } + this->root_span.finish_time_us = std::chrono::duration_cast(std::chrono::system_clock::now().time_since_epoch()).count(); From cc1bd3ac362151a2e8f57025440d12b29efa5c23 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Tue, 6 Sep 2022 16:15:50 +0000 Subject: [PATCH 15/24] fix: disable vectorscan when building w/o SSE >=3 --- contrib/vectorscan-cmake/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/vectorscan-cmake/CMakeLists.txt b/contrib/vectorscan-cmake/CMakeLists.txt index bc17105be99..f9f46d9a8cf 100644 --- a/contrib/vectorscan-cmake/CMakeLists.txt +++ b/contrib/vectorscan-cmake/CMakeLists.txt @@ -1,6 +1,6 @@ # We use vectorscan, a portable and API/ABI-compatible drop-in replacement for hyperscan. -if (ARCH_AMD64) +if (ARCH_AMD64 AND NOT NO_SSE3_OR_HIGHER) option (ENABLE_VECTORSCAN "Enable vectorscan library" ${ENABLE_LIBRARIES}) endif() From d054ffd1109ab93ec366a64558c7c983cd23a11e Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Tue, 6 Sep 2022 18:23:19 +0000 Subject: [PATCH 16/24] fix: don't force-inline SSE3 code into generic code Force-inlining code compiled for SSE3 into "generic" (non-platform-specific) code works for standard x86 builds where everything is compiled with SSE 4.2 (and smaller). It no longer works if we compile everything only with SSE 2. --- src/Functions/GatherUtils/sliceHasImplAnyAll.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Functions/GatherUtils/sliceHasImplAnyAll.h b/src/Functions/GatherUtils/sliceHasImplAnyAll.h index 21c80b742fd..3ca8c6be9a7 100644 --- a/src/Functions/GatherUtils/sliceHasImplAnyAll.h +++ b/src/Functions/GatherUtils/sliceHasImplAnyAll.h @@ -419,7 +419,7 @@ DECLARE_SSE42_SPECIFIC_CODE ( // SSE4.2 Int64, UInt64 specialization template requires (std::is_same_v || std::is_same_v) -inline ALWAYS_INLINE bool sliceHasImplAnyAllImplInt64( +inline bool sliceHasImplAnyAllImplInt64( const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, @@ -495,7 +495,7 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAllImplInt64( // SSE4.2 Int32, UInt32 specialization template requires (std::is_same_v || std::is_same_v) -inline ALWAYS_INLINE bool sliceHasImplAnyAllImplInt32( +inline bool sliceHasImplAnyAllImplInt32( const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, @@ -580,7 +580,7 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAllImplInt32( // SSE4.2 Int16, UInt16 specialization template requires (std::is_same_v || std::is_same_v) -inline ALWAYS_INLINE bool sliceHasImplAnyAllImplInt16( +inline bool sliceHasImplAnyAllImplInt16( const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, @@ -682,7 +682,7 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAllImplInt16( // SSE2 Int8, UInt8 specialization template requires (std::is_same_v || std::is_same_v) -inline ALWAYS_INLINE bool sliceHasImplAnyAllImplInt8( +inline bool sliceHasImplAnyAllImplInt8( const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, From de8f6bdce790df57dc89841620edd4ff1c08a9ef Mon Sep 17 00:00:00 2001 From: Frank Chen Date: Wed, 7 Sep 2022 13:39:12 +0800 Subject: [PATCH 17/24] More safe Signed-off-by: Frank Chen --- src/Common/OpenTelemetryTraceContext.cpp | 130 ++++++++++++++--------- 
src/Common/OpenTelemetryTraceContext.h | 2 +- 2 files changed, 82 insertions(+), 50 deletions(-) diff --git a/src/Common/OpenTelemetryTraceContext.cpp b/src/Common/OpenTelemetryTraceContext.cpp index 9efb278a670..9230e727eaf 100644 --- a/src/Common/OpenTelemetryTraceContext.cpp +++ b/src/Common/OpenTelemetryTraceContext.cpp @@ -88,7 +88,13 @@ void Span::addAttribute(std::exception_ptr e) noexcept SpanHolder::SpanHolder(std::string_view _operation_name) { - if (current_thread_trace_context.isTraceEnabled()) + if (!current_thread_trace_context.isTraceEnabled()) + { + return; + } + + /// Use try-catch to make sure the ctor is exception safe. + try { this->trace_id = current_thread_trace_context.trace_id; this->parent_span_id = current_thread_trace_context.span_id; @@ -97,9 +103,19 @@ SpanHolder::SpanHolder(std::string_view _operation_name) this->start_time_us = std::chrono::duration_cast(std::chrono::system_clock::now().time_since_epoch()).count(); - // set current span id to this - current_thread_trace_context.span_id = this->span_id; + /// Add new intialization here } + catch (...) + { + tryLogCurrentException(__FUNCTION__); + + /// Clear related fields to make sure the span won't be recorded. + this->trace_id = UUID(); + return; + } + + /// Set current span as parent of other spans created later on this thread. + current_thread_trace_context.span_id = this->span_id; } void SpanHolder::finish() noexcept @@ -216,7 +232,7 @@ const TracingContextOnThread & CurrentContext() return current_thread_trace_context; } -void TracingContextOnThread::reset() +void TracingContextOnThread::reset() noexcept { this->trace_id = UUID(); this->span_id = 0; @@ -231,59 +247,75 @@ TracingContextHolder::TracingContextHolder( const Settings * settings_ptr, const std::weak_ptr & _span_log) { - if (current_thread_trace_context.isTraceEnabled()) + /// Use try-catch to make sure the ctor is exception safe. + /// If any exception is raised during the construction, the tracing is not enabled on current thread. + try { - /// - /// This is not the normal case, - /// it means that construction of current object is not at the start of current thread. - /// Usually this is due to: - /// 1. bad design - /// 2. right design but code changes so that original point where this object is constructing is not the new start execution of current thread - /// - /// In such case, we should use current context as parent of this new constructing object, - /// So this branch ensures this class can be instantiated multiple times on one same thread safely. - /// - this->is_context_owner = false; - this->root_span.trace_id = current_thread_trace_context.trace_id; - this->root_span.parent_span_id = current_thread_trace_context.span_id; + if (current_thread_trace_context.isTraceEnabled()) + { + /// + /// This is not the normal case, + /// it means that construction of current object is not at the start of current thread. + /// Usually this is due to: + /// 1. bad design + /// 2. right design but code changes so that original point where this object is constructing is not the new start execution of current thread + /// + /// In such case, we should use current context as parent of this new constructing object, + /// So this branch ensures this class can be instantiated multiple times on one same thread safely. 
+ /// + this->is_context_owner = false; + this->root_span.trace_id = current_thread_trace_context.trace_id; + this->root_span.parent_span_id = current_thread_trace_context.span_id; + this->root_span.span_id = thread_local_rng(); + this->root_span.operation_name = _operation_name; + this->root_span.start_time_us + = std::chrono::duration_cast(std::chrono::system_clock::now().time_since_epoch()).count(); + + /// Set the root span as parent of other spans created on current thread + current_thread_trace_context.span_id = this->root_span.span_id; + return; + } + + if (!_parent_trace_context.isTraceEnabled()) + { + if (settings_ptr == nullptr) + /// Skip tracing context initialization on current thread + return; + + // Start the trace with some configurable probability. + std::bernoulli_distribution should_start_trace{settings_ptr->opentelemetry_start_trace_probability}; + if (!should_start_trace(thread_local_rng)) + /// skip tracing context initialization on current thread + return; + + while (_parent_trace_context.trace_id == UUID()) + { + // Make sure the random generated trace_id is not 0 which is an invalid id. + _parent_trace_context.trace_id.toUnderType().items[0] = thread_local_rng(); //-V656 + _parent_trace_context.trace_id.toUnderType().items[1] = thread_local_rng(); //-V656 + } + _parent_trace_context.span_id = 0; + } + + this->root_span.trace_id = _parent_trace_context.trace_id; + this->root_span.parent_span_id = _parent_trace_context.span_id; this->root_span.span_id = thread_local_rng(); this->root_span.operation_name = _operation_name; this->root_span.start_time_us = std::chrono::duration_cast(std::chrono::system_clock::now().time_since_epoch()).count(); - current_thread_trace_context.span_id = this->root_span.span_id; + /// Add new initialization here + } + catch(...) + { + tryLogCurrentException(__FUNCTION__); + + /// Clear related fields to make sure the tracing is not enabled. + this->root_span.trace_id = UUID(); return; } - if (!_parent_trace_context.isTraceEnabled()) - { - if (settings_ptr == nullptr) - /// skip tracing context initialization on current thread - return; - - // start the trace ourselves, with some configurable probability. - std::bernoulli_distribution should_start_trace{settings_ptr->opentelemetry_start_trace_probability}; - if (!should_start_trace(thread_local_rng)) - /// skip tracing context initialization on current thread - return; - - while (_parent_trace_context.trace_id == UUID()) - { - // make sure the random generated trace_id is not 0 which is an invalid id - _parent_trace_context.trace_id.toUnderType().items[0] = thread_local_rng(); //-V656 - _parent_trace_context.trace_id.toUnderType().items[1] = thread_local_rng(); //-V656 - } - _parent_trace_context.span_id = 0; - } - - this->root_span.trace_id = _parent_trace_context.trace_id; - this->root_span.parent_span_id = _parent_trace_context.span_id; - this->root_span.span_id = thread_local_rng(); - this->root_span.operation_name = _operation_name; - this->root_span.start_time_us - = std::chrono::duration_cast(std::chrono::system_clock::now().time_since_epoch()).count(); - - /// set up trace context on current thread + /// Set up trace context on current thread only when the root span is successfully intialized. 
current_thread_trace_context = _parent_trace_context; current_thread_trace_context.span_id = this->root_span.span_id; current_thread_trace_context.trace_flags = TRACE_FLAG_SAMPLED; @@ -313,7 +345,7 @@ TracingContextHolder::~TracingContextHolder() /// It's acceptable that the attribute is not recorded in case of any exception, /// so the exception is ignored to try to log the span. } - + this->root_span.finish_time_us = std::chrono::duration_cast(std::chrono::system_clock::now().time_since_epoch()).count(); diff --git a/src/Common/OpenTelemetryTraceContext.h b/src/Common/OpenTelemetryTraceContext.h index 3964b5030fc..63136f8731d 100644 --- a/src/Common/OpenTelemetryTraceContext.h +++ b/src/Common/OpenTelemetryTraceContext.h @@ -74,7 +74,7 @@ struct TracingContextOnThread : TracingContext return *this; } - void reset(); + void reset() noexcept; /// Use weak_ptr instead of shared_ptr to hold a reference to the underlying system.opentelemetry_span_log table /// Since this object is kept on threads and passed across threads, a weak_ptr is more safe to prevent potential leak From fc05b05be3210667585fa490146bf659ede1d1f4 Mon Sep 17 00:00:00 2001 From: Frank Chen Date: Wed, 7 Sep 2022 15:14:43 +0800 Subject: [PATCH 18/24] Fix style and typo Signed-off-by: Frank Chen --- src/Common/OpenTelemetryTraceContext.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Common/OpenTelemetryTraceContext.cpp b/src/Common/OpenTelemetryTraceContext.cpp index 9230e727eaf..7a1f94926d5 100644 --- a/src/Common/OpenTelemetryTraceContext.cpp +++ b/src/Common/OpenTelemetryTraceContext.cpp @@ -103,7 +103,7 @@ SpanHolder::SpanHolder(std::string_view _operation_name) this->start_time_us = std::chrono::duration_cast(std::chrono::system_clock::now().time_since_epoch()).count(); - /// Add new intialization here + /// Add new initialization here } catch (...) { @@ -306,7 +306,7 @@ TracingContextHolder::TracingContextHolder( /// Add new initialization here } - catch(...) + catch (...) { tryLogCurrentException(__FUNCTION__); @@ -315,7 +315,7 @@ TracingContextHolder::TracingContextHolder( return; } - /// Set up trace context on current thread only when the root span is successfully intialized. + /// Set up trace context on current thread only when the root span is successfully initialized. current_thread_trace_context = _parent_trace_context; current_thread_trace_context.span_id = this->root_span.span_id; current_thread_trace_context.trace_flags = TRACE_FLAG_SAMPLED; From bf8fed8be8cac07ef0cc0b56e05e3d6631976c27 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 7 Sep 2022 09:45:07 +0000 Subject: [PATCH 19/24] Revert "fix: don't force-inline SSE3 code into generic code" This reverts commit d054ffd1109ab93ec366a64558c7c983cd23a11e. 
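[Editor's sketch] This revert restores `ALWAYS_INLINE` on the SSE4.2 specializations in `sliceHasImplAnyAll.h` (see the diff below); the subsequent commit then disables the multitarget machinery for SSE2-only builds instead. The underlying failure mode is that a function compiled for a higher instruction set cannot be force-inlined into a caller compiled for a lower one. A contrived, self-contained reproduction for x86-64 clang/gcc (my own illustration, not ClickHouse's actual macros or function names):

``` cpp
#include <cstddef>
#include <cstdint>

// Compiled for SSE4.2 regardless of the translation unit's -m flags, similar
// to code inside a DECLARE_SSE42_SPECIFIC_CODE block; always_inline forces it
// into every caller.
__attribute__((target("sse4.2"), always_inline))
static inline uint64_t crc_step(uint64_t acc, uint64_t value)
{
    return __builtin_ia32_crc32di(acc, value); // an SSE4.2 instruction
}

// "Generic" (non-platform-specific) code, built with the project-wide flags.
uint64_t generic_hash(const uint64_t * data, std::size_t size)
{
    uint64_t acc = 0;
    for (std::size_t i = 0; i < size; ++i)
        acc = crc_step(acc, data[i]); // rejected when this TU is SSE2-only:
                                      // the compiler refuses to force-inline
                                      // a callee requiring features the
                                      // caller was not built with
    return acc;
}
```

Built with `-msse4.2` (or higher) this compiles and inlines as intended; built with only `-msse2`, clang and gcc reject the forced inlining, which is essentially how the SSE4.2 requirement "leaks" into generic code as the commit messages describe.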
--- src/Functions/GatherUtils/sliceHasImplAnyAll.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Functions/GatherUtils/sliceHasImplAnyAll.h b/src/Functions/GatherUtils/sliceHasImplAnyAll.h index 3ca8c6be9a7..21c80b742fd 100644 --- a/src/Functions/GatherUtils/sliceHasImplAnyAll.h +++ b/src/Functions/GatherUtils/sliceHasImplAnyAll.h @@ -419,7 +419,7 @@ DECLARE_SSE42_SPECIFIC_CODE ( // SSE4.2 Int64, UInt64 specialization template requires (std::is_same_v || std::is_same_v) -inline bool sliceHasImplAnyAllImplInt64( +inline ALWAYS_INLINE bool sliceHasImplAnyAllImplInt64( const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, @@ -495,7 +495,7 @@ inline bool sliceHasImplAnyAllImplInt64( // SSE4.2 Int32, UInt32 specialization template requires (std::is_same_v || std::is_same_v) -inline bool sliceHasImplAnyAllImplInt32( +inline ALWAYS_INLINE bool sliceHasImplAnyAllImplInt32( const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, @@ -580,7 +580,7 @@ inline bool sliceHasImplAnyAllImplInt32( // SSE4.2 Int16, UInt16 specialization template requires (std::is_same_v || std::is_same_v) -inline bool sliceHasImplAnyAllImplInt16( +inline ALWAYS_INLINE bool sliceHasImplAnyAllImplInt16( const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, @@ -682,7 +682,7 @@ inline bool sliceHasImplAnyAllImplInt16( // SSE2 Int8, UInt8 specialization template requires (std::is_same_v || std::is_same_v) -inline bool sliceHasImplAnyAllImplInt8( +inline ALWAYS_INLINE bool sliceHasImplAnyAllImplInt8( const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, From c07f234f095f575f2fb6902b007c5b736061ec48 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 7 Sep 2022 10:01:51 +0000 Subject: [PATCH 20/24] fix: disable ENABLE_MULTITARGET_CODE for SSE2 builds --- src/CMakeLists.txt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index fd8771c1529..3dc42746d67 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -42,6 +42,14 @@ endif () # See `src/Common/TargetSpecific.h` option(ENABLE_MULTITARGET_CODE "Enable platform-dependent code" ON) +if (NO_SSE3_OR_HIGHER) + # Optimized x86 code in DECLARE_*_SPECIFIC_CODE blocks (see `src/Common/TargetSpecific.h`) is sometimes marked FORCE_INLINE. As a + # result, its instruction set requirements (e.g. SSE4.2) leak into generic code. This is normally not a problem for standard x86 builds + # because generic code is compiled with SSE 4.2 anyways. But it breaks SSE2-only builds. Therefore disabling the multitarget code + # machinery and always use generic code. (The cleaner alternative is removing FORCE_INLINE but that impacts performance too much.) 
+ set(ENABLE_MULTITARGET_CODE OFF) +endif() + if (ENABLE_MULTITARGET_CODE) add_definitions(-DENABLE_MULTITARGET_CODE=1) else() From 1ae54d3d16cef85f53b87f48b453cd985a04d0af Mon Sep 17 00:00:00 2001 From: peter279k Date: Thu, 8 Sep 2022 01:18:27 +0800 Subject: [PATCH 21/24] Improve clickhouse start command --- docs/en/getting-started/install.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/getting-started/install.md b/docs/en/getting-started/install.md index 1a17e63a274..92873cb5fbf 100644 --- a/docs/en/getting-started/install.md +++ b/docs/en/getting-started/install.md @@ -221,7 +221,7 @@ For non-Linux operating systems and for AArch64 CPU architecture, ClickHouse bui curl -O 'https://builds.clickhouse.com/master/aarch64/clickhouse' && chmod a+x ./clickhouse ``` -Run `sudo ./clickhouse install` to install ClickHouse system-wide (also with needed configuration files, configuring users etc.). Then run `clickhouse start` commands to start the clickhouse-server and `clickhouse-client` to connect to it. +Run `sudo ./clickhouse install` to install ClickHouse system-wide (also with needed configuration files, configuring users etc.). Then run `sudo clickhouse start` commands to start the clickhouse-server and `clickhouse-client` to connect to it. Use the `clickhouse client` to connect to the server, or `clickhouse local` to process local data. From 0071ef9e389bf2ce2076b0fe5268027da9f9c7a3 Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Wed, 7 Sep 2022 15:56:31 -0300 Subject: [PATCH 22/24] Update date-time-functions.md --- .../functions/date-time-functions.md | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index 3515e903adf..49fbb961af8 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -640,7 +640,8 @@ Result: ## date\_diff -Returns the difference between two dates or dates with time values. +Returns the difference between two dates or dates with time values. +The difference is calculated using relative units, e.g. the difference between `2022-01-01` and `2021-12-29` is 3 days for day unit (see (toRelativeDayNum)[#toRelativeDayNum]), 1 month for month unit (see (toRelativeMonthNum)[#toRelativeMonthNum]), 1 year for year unit (see (toRelativeYearNum)[#toRelativeYearNum]). **Syntax** @@ -692,6 +693,25 @@ Result: └────────────────────────────────────────────────────────────────────────────────────────┘ ``` +Query: + +``` sql +SELECT + toDate('2022-01-01') AS e, + toDate('2021-12-29') AS s, + dateDiff('day', s, e) AS day_diff, + dateDiff('month', s, e) AS month__diff, + dateDiff('year', s, e) AS year_diff; +``` + +Result: + +``` text +┌──────────e─┬──────────s─┬─day_diff─┬─month__diff─┬─year_diff─┐ +│ 2022-01-01 │ 2021-12-29 │ 3 │ 1 │ 1 │ +└────────────┴────────────┴──────────┴─────────────┴───────────┘ +``` + ## date\_sub Subtracts the time interval or date interval from the provided date or date with time. 
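[Editor's sketch] The `date_diff` documentation added above defines the result through relative unit numbers (crossed unit boundaries) rather than elapsed time, which is why `2021-12-29` to `2022-01-01` counts as one month. A tiny self-contained sketch of that rule for the month unit (my own illustration; the actual ClickHouse implementation is not shown in this patch series):

``` cpp
#include <iostream>

// dateDiff('month', s, e) counts crossed month boundaries, i.e. it behaves
// like a difference of relative month numbers (year * 12 + month), not like
// floor(elapsed_time / average_month_length).
static int relative_month_num(int year, int month)
{
    return year * 12 + month;
}

int main()
{
    // 2021-12-29 and 2022-01-01 are only 3 days apart, but one month
    // boundary is crossed, so the month-unit difference is 1, matching
    // the month__diff column in the documentation example above.
    std::cout << relative_month_num(2022, 1) - relative_month_num(2021, 12) << '\n'; // prints 1
}
```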
From fb6b26c7a48ae2688d106a4b69d76fd5a03d215c Mon Sep 17 00:00:00 2001 From: Yuko Takagi <70714860+yukotakagi@users.noreply.github.com> Date: Wed, 7 Sep 2022 12:58:36 -0600 Subject: [PATCH 23/24] Update README.md (#41091) --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index b173add94e3..49aed14f719 100644 --- a/README.md +++ b/README.md @@ -15,4 +15,5 @@ ClickHouse® is an open-source column-oriented database management system that a * [Contacts](https://clickhouse.com/company/contact) can help to get your questions answered if there are any. ## Upcoming events -* [**v22.8 Release Webinar**](https://clickhouse.com/company/events/v22-8-release-webinar) Original creator, co-founder, and CTO of ClickHouse Alexey Milovidov will walk us through the highlights of the release, provide live demos, and share vision into what is coming in the roadmap. +* [**v22.9 Release Webinar**](https://clickhouse.com/company/events/v22-9-release-webinar) Original creator, co-founder, and CTO of ClickHouse Alexey Milovidov will walk us through the highlights of the release, provide live demos, and share vision into what is coming in the roadmap. +* [**ClickHouse for Analytics @ Barracuda Networks**](https://www.meetup.com/clickhouse-silicon-valley-meetup-group/events/288140358/) Join us for this in person meetup hosted by our friends at Barracuda in Bay Area. From a75eb5ad84309227fd2721313f8d7a7754557032 Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Wed, 7 Sep 2022 15:59:23 -0300 Subject: [PATCH 24/24] Update date-time-functions.md --- docs/en/sql-reference/functions/date-time-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index 49fbb961af8..2a2681ca1e8 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -641,7 +641,7 @@ Result: ## date\_diff Returns the difference between two dates or dates with time values. -The difference is calculated using relative units, e.g. the difference between `2022-01-01` and `2021-12-29` is 3 days for day unit (see (toRelativeDayNum)[#toRelativeDayNum]), 1 month for month unit (see (toRelativeMonthNum)[#toRelativeMonthNum]), 1 year for year unit (see (toRelativeYearNum)[#toRelativeYearNum]). +The difference is calculated using relative units, e.g. the difference between `2022-01-01` and `2021-12-29` is 3 days for day unit (see [toRelativeDayNum](#toRelativeDayNum)), 1 month for month unit (see [toRelativeMonthNum](#toRelativeMonthNum)), 1 year for year unit (see [toRelativeYearNum](#toRelativeYearNum)). **Syntax**